altimate-code 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +35 -0
- package/README.md +1 -5
- package/bin/altimate +6 -0
- package/bin/altimate-code +6 -0
- package/dbt-tools/bin/altimate-dbt +2 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/__init__.py +0 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/fetch_schema.py +35 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/utils.py +353 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/validate_sql.py +114 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/__init__.py +178 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/__main__.py +96 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/_typing.py +17 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/__init__.py +3 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/__init__.py +18 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/_typing.py +18 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/column.py +332 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/dataframe.py +866 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/functions.py +1267 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/group.py +59 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/normalize.py +78 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/operations.py +53 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/readwriter.py +108 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/session.py +190 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/transforms.py +9 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/types.py +212 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/util.py +32 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/window.py +134 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/__init__.py +118 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/athena.py +166 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/bigquery.py +1331 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/clickhouse.py +1393 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/databricks.py +131 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/dialect.py +1915 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/doris.py +561 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/drill.py +157 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/druid.py +20 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/duckdb.py +1159 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/dune.py +16 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/hive.py +787 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/materialize.py +94 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/mysql.py +1324 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/oracle.py +378 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/postgres.py +778 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/presto.py +788 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/prql.py +203 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/redshift.py +448 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/risingwave.py +78 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/snowflake.py +1464 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/spark.py +202 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/spark2.py +349 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/sqlite.py +320 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/starrocks.py +343 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/tableau.py +61 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/teradata.py +356 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/trino.py +115 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/tsql.py +1403 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/diff.py +456 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/errors.py +93 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/__init__.py +95 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/context.py +101 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/env.py +246 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/python.py +460 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/table.py +155 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/expressions.py +8870 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/generator.py +4993 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/helper.py +582 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/jsonpath.py +227 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/lineage.py +423 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/__init__.py +11 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/annotate_types.py +589 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/canonicalize.py +222 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_ctes.py +43 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_joins.py +181 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_subqueries.py +189 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/isolate_table_selects.py +50 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/merge_subqueries.py +415 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/normalize.py +200 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/normalize_identifiers.py +64 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/optimize_joins.py +91 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/optimizer.py +94 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/pushdown_predicates.py +222 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/pushdown_projections.py +172 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify.py +104 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify_columns.py +1024 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify_tables.py +155 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/scope.py +904 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/simplify.py +1587 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/unnest_subqueries.py +302 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/parser.py +8501 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/planner.py +463 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/schema.py +588 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/serde.py +68 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/time.py +687 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/tokens.py +1520 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/transforms.py +1020 -0
- package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/trie.py +81 -0
- package/dbt-tools/dist/altimate_python_packages/dbt_core_integration.py +825 -0
- package/dbt-tools/dist/altimate_python_packages/dbt_utils.py +157 -0
- package/dbt-tools/dist/index.js +23859 -0
- package/package.json +13 -13
- package/postinstall.mjs +42 -0
- package/skills/altimate-setup/SKILL.md +31 -0
|
@@ -0,0 +1,1520 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import typing as t
|
|
5
|
+
from enum import auto
|
|
6
|
+
|
|
7
|
+
from sqlglot.errors import SqlglotError, TokenError
|
|
8
|
+
from sqlglot.helper import AutoName
|
|
9
|
+
from sqlglot.trie import TrieResult, in_trie, new_trie
|
|
10
|
+
|
|
11
|
+
if t.TYPE_CHECKING:
|
|
12
|
+
from sqlglot.dialects.dialect import DialectType
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from sqlglotrs import ( # type: ignore
|
|
17
|
+
Tokenizer as RsTokenizer,
|
|
18
|
+
TokenizerDialectSettings as RsTokenizerDialectSettings,
|
|
19
|
+
TokenizerSettings as RsTokenizerSettings,
|
|
20
|
+
TokenTypeSettings as RsTokenTypeSettings,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
USE_RS_TOKENIZER = os.environ.get("SQLGLOTRS_TOKENIZER", "1") == "1"
|
|
24
|
+
except ImportError:
|
|
25
|
+
USE_RS_TOKENIZER = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TokenType(AutoName):
|
|
29
|
+
L_PAREN = auto()
|
|
30
|
+
R_PAREN = auto()
|
|
31
|
+
L_BRACKET = auto()
|
|
32
|
+
R_BRACKET = auto()
|
|
33
|
+
L_BRACE = auto()
|
|
34
|
+
R_BRACE = auto()
|
|
35
|
+
COMMA = auto()
|
|
36
|
+
DOT = auto()
|
|
37
|
+
DASH = auto()
|
|
38
|
+
PLUS = auto()
|
|
39
|
+
COLON = auto()
|
|
40
|
+
DOTCOLON = auto()
|
|
41
|
+
DCOLON = auto()
|
|
42
|
+
DQMARK = auto()
|
|
43
|
+
SEMICOLON = auto()
|
|
44
|
+
STAR = auto()
|
|
45
|
+
BACKSLASH = auto()
|
|
46
|
+
SLASH = auto()
|
|
47
|
+
LT = auto()
|
|
48
|
+
LTE = auto()
|
|
49
|
+
GT = auto()
|
|
50
|
+
GTE = auto()
|
|
51
|
+
NOT = auto()
|
|
52
|
+
EQ = auto()
|
|
53
|
+
NEQ = auto()
|
|
54
|
+
NULLSAFE_EQ = auto()
|
|
55
|
+
COLON_EQ = auto()
|
|
56
|
+
AND = auto()
|
|
57
|
+
OR = auto()
|
|
58
|
+
AMP = auto()
|
|
59
|
+
DPIPE = auto()
|
|
60
|
+
PIPE_GT = auto()
|
|
61
|
+
PIPE = auto()
|
|
62
|
+
PIPE_SLASH = auto()
|
|
63
|
+
DPIPE_SLASH = auto()
|
|
64
|
+
CARET = auto()
|
|
65
|
+
CARET_AT = auto()
|
|
66
|
+
TILDA = auto()
|
|
67
|
+
ARROW = auto()
|
|
68
|
+
DARROW = auto()
|
|
69
|
+
FARROW = auto()
|
|
70
|
+
HASH = auto()
|
|
71
|
+
HASH_ARROW = auto()
|
|
72
|
+
DHASH_ARROW = auto()
|
|
73
|
+
LR_ARROW = auto()
|
|
74
|
+
DAT = auto()
|
|
75
|
+
LT_AT = auto()
|
|
76
|
+
AT_GT = auto()
|
|
77
|
+
DOLLAR = auto()
|
|
78
|
+
PARAMETER = auto()
|
|
79
|
+
SESSION_PARAMETER = auto()
|
|
80
|
+
DAMP = auto()
|
|
81
|
+
XOR = auto()
|
|
82
|
+
DSTAR = auto()
|
|
83
|
+
|
|
84
|
+
URI_START = auto()
|
|
85
|
+
|
|
86
|
+
BLOCK_START = auto()
|
|
87
|
+
BLOCK_END = auto()
|
|
88
|
+
|
|
89
|
+
SPACE = auto()
|
|
90
|
+
BREAK = auto()
|
|
91
|
+
|
|
92
|
+
STRING = auto()
|
|
93
|
+
NUMBER = auto()
|
|
94
|
+
IDENTIFIER = auto()
|
|
95
|
+
DATABASE = auto()
|
|
96
|
+
COLUMN = auto()
|
|
97
|
+
COLUMN_DEF = auto()
|
|
98
|
+
SCHEMA = auto()
|
|
99
|
+
TABLE = auto()
|
|
100
|
+
WAREHOUSE = auto()
|
|
101
|
+
STAGE = auto()
|
|
102
|
+
STREAMLIT = auto()
|
|
103
|
+
VAR = auto()
|
|
104
|
+
BIT_STRING = auto()
|
|
105
|
+
HEX_STRING = auto()
|
|
106
|
+
BYTE_STRING = auto()
|
|
107
|
+
NATIONAL_STRING = auto()
|
|
108
|
+
RAW_STRING = auto()
|
|
109
|
+
HEREDOC_STRING = auto()
|
|
110
|
+
UNICODE_STRING = auto()
|
|
111
|
+
|
|
112
|
+
# types
|
|
113
|
+
BIT = auto()
|
|
114
|
+
BOOLEAN = auto()
|
|
115
|
+
TINYINT = auto()
|
|
116
|
+
UTINYINT = auto()
|
|
117
|
+
SMALLINT = auto()
|
|
118
|
+
USMALLINT = auto()
|
|
119
|
+
MEDIUMINT = auto()
|
|
120
|
+
UMEDIUMINT = auto()
|
|
121
|
+
INT = auto()
|
|
122
|
+
UINT = auto()
|
|
123
|
+
BIGINT = auto()
|
|
124
|
+
UBIGINT = auto()
|
|
125
|
+
INT128 = auto()
|
|
126
|
+
UINT128 = auto()
|
|
127
|
+
INT256 = auto()
|
|
128
|
+
UINT256 = auto()
|
|
129
|
+
FLOAT = auto()
|
|
130
|
+
DOUBLE = auto()
|
|
131
|
+
UDOUBLE = auto()
|
|
132
|
+
DECIMAL = auto()
|
|
133
|
+
DECIMAL32 = auto()
|
|
134
|
+
DECIMAL64 = auto()
|
|
135
|
+
DECIMAL128 = auto()
|
|
136
|
+
DECIMAL256 = auto()
|
|
137
|
+
UDECIMAL = auto()
|
|
138
|
+
BIGDECIMAL = auto()
|
|
139
|
+
CHAR = auto()
|
|
140
|
+
NCHAR = auto()
|
|
141
|
+
VARCHAR = auto()
|
|
142
|
+
NVARCHAR = auto()
|
|
143
|
+
BPCHAR = auto()
|
|
144
|
+
TEXT = auto()
|
|
145
|
+
MEDIUMTEXT = auto()
|
|
146
|
+
LONGTEXT = auto()
|
|
147
|
+
BLOB = auto()
|
|
148
|
+
MEDIUMBLOB = auto()
|
|
149
|
+
LONGBLOB = auto()
|
|
150
|
+
TINYBLOB = auto()
|
|
151
|
+
TINYTEXT = auto()
|
|
152
|
+
NAME = auto()
|
|
153
|
+
BINARY = auto()
|
|
154
|
+
VARBINARY = auto()
|
|
155
|
+
JSON = auto()
|
|
156
|
+
JSONB = auto()
|
|
157
|
+
TIME = auto()
|
|
158
|
+
TIMETZ = auto()
|
|
159
|
+
TIMESTAMP = auto()
|
|
160
|
+
TIMESTAMPTZ = auto()
|
|
161
|
+
TIMESTAMPLTZ = auto()
|
|
162
|
+
TIMESTAMPNTZ = auto()
|
|
163
|
+
TIMESTAMP_S = auto()
|
|
164
|
+
TIMESTAMP_MS = auto()
|
|
165
|
+
TIMESTAMP_NS = auto()
|
|
166
|
+
DATETIME = auto()
|
|
167
|
+
DATETIME2 = auto()
|
|
168
|
+
DATETIME64 = auto()
|
|
169
|
+
SMALLDATETIME = auto()
|
|
170
|
+
DATE = auto()
|
|
171
|
+
DATE32 = auto()
|
|
172
|
+
INT4RANGE = auto()
|
|
173
|
+
INT4MULTIRANGE = auto()
|
|
174
|
+
INT8RANGE = auto()
|
|
175
|
+
INT8MULTIRANGE = auto()
|
|
176
|
+
NUMRANGE = auto()
|
|
177
|
+
NUMMULTIRANGE = auto()
|
|
178
|
+
TSRANGE = auto()
|
|
179
|
+
TSMULTIRANGE = auto()
|
|
180
|
+
TSTZRANGE = auto()
|
|
181
|
+
TSTZMULTIRANGE = auto()
|
|
182
|
+
DATERANGE = auto()
|
|
183
|
+
DATEMULTIRANGE = auto()
|
|
184
|
+
UUID = auto()
|
|
185
|
+
GEOGRAPHY = auto()
|
|
186
|
+
NULLABLE = auto()
|
|
187
|
+
GEOMETRY = auto()
|
|
188
|
+
POINT = auto()
|
|
189
|
+
RING = auto()
|
|
190
|
+
LINESTRING = auto()
|
|
191
|
+
MULTILINESTRING = auto()
|
|
192
|
+
POLYGON = auto()
|
|
193
|
+
MULTIPOLYGON = auto()
|
|
194
|
+
HLLSKETCH = auto()
|
|
195
|
+
HSTORE = auto()
|
|
196
|
+
SUPER = auto()
|
|
197
|
+
SERIAL = auto()
|
|
198
|
+
SMALLSERIAL = auto()
|
|
199
|
+
BIGSERIAL = auto()
|
|
200
|
+
XML = auto()
|
|
201
|
+
YEAR = auto()
|
|
202
|
+
USERDEFINED = auto()
|
|
203
|
+
MONEY = auto()
|
|
204
|
+
SMALLMONEY = auto()
|
|
205
|
+
ROWVERSION = auto()
|
|
206
|
+
IMAGE = auto()
|
|
207
|
+
VARIANT = auto()
|
|
208
|
+
OBJECT = auto()
|
|
209
|
+
INET = auto()
|
|
210
|
+
IPADDRESS = auto()
|
|
211
|
+
IPPREFIX = auto()
|
|
212
|
+
IPV4 = auto()
|
|
213
|
+
IPV6 = auto()
|
|
214
|
+
ENUM = auto()
|
|
215
|
+
ENUM8 = auto()
|
|
216
|
+
ENUM16 = auto()
|
|
217
|
+
FIXEDSTRING = auto()
|
|
218
|
+
LOWCARDINALITY = auto()
|
|
219
|
+
NESTED = auto()
|
|
220
|
+
AGGREGATEFUNCTION = auto()
|
|
221
|
+
SIMPLEAGGREGATEFUNCTION = auto()
|
|
222
|
+
TDIGEST = auto()
|
|
223
|
+
UNKNOWN = auto()
|
|
224
|
+
VECTOR = auto()
|
|
225
|
+
DYNAMIC = auto()
|
|
226
|
+
VOID = auto()
|
|
227
|
+
|
|
228
|
+
# keywords
|
|
229
|
+
ALIAS = auto()
|
|
230
|
+
ALTER = auto()
|
|
231
|
+
ALWAYS = auto()
|
|
232
|
+
ALL = auto()
|
|
233
|
+
ANTI = auto()
|
|
234
|
+
ANY = auto()
|
|
235
|
+
APPLY = auto()
|
|
236
|
+
ARRAY = auto()
|
|
237
|
+
ASC = auto()
|
|
238
|
+
ASOF = auto()
|
|
239
|
+
ATTACH = auto()
|
|
240
|
+
AUTO_INCREMENT = auto()
|
|
241
|
+
BEGIN = auto()
|
|
242
|
+
BETWEEN = auto()
|
|
243
|
+
BULK_COLLECT_INTO = auto()
|
|
244
|
+
CACHE = auto()
|
|
245
|
+
CASE = auto()
|
|
246
|
+
CHARACTER_SET = auto()
|
|
247
|
+
CLUSTER_BY = auto()
|
|
248
|
+
COLLATE = auto()
|
|
249
|
+
COMMAND = auto()
|
|
250
|
+
COMMENT = auto()
|
|
251
|
+
COMMIT = auto()
|
|
252
|
+
CONNECT_BY = auto()
|
|
253
|
+
CONSTRAINT = auto()
|
|
254
|
+
COPY = auto()
|
|
255
|
+
CREATE = auto()
|
|
256
|
+
CROSS = auto()
|
|
257
|
+
CUBE = auto()
|
|
258
|
+
CURRENT_DATE = auto()
|
|
259
|
+
CURRENT_DATETIME = auto()
|
|
260
|
+
CURRENT_SCHEMA = auto()
|
|
261
|
+
CURRENT_TIME = auto()
|
|
262
|
+
CURRENT_TIMESTAMP = auto()
|
|
263
|
+
CURRENT_USER = auto()
|
|
264
|
+
DECLARE = auto()
|
|
265
|
+
DEFAULT = auto()
|
|
266
|
+
DELETE = auto()
|
|
267
|
+
DESC = auto()
|
|
268
|
+
DESCRIBE = auto()
|
|
269
|
+
DETACH = auto()
|
|
270
|
+
DICTIONARY = auto()
|
|
271
|
+
DISTINCT = auto()
|
|
272
|
+
DISTRIBUTE_BY = auto()
|
|
273
|
+
DIV = auto()
|
|
274
|
+
DROP = auto()
|
|
275
|
+
ELSE = auto()
|
|
276
|
+
END = auto()
|
|
277
|
+
ESCAPE = auto()
|
|
278
|
+
EXCEPT = auto()
|
|
279
|
+
EXECUTE = auto()
|
|
280
|
+
EXISTS = auto()
|
|
281
|
+
FALSE = auto()
|
|
282
|
+
FETCH = auto()
|
|
283
|
+
FILE_FORMAT = auto()
|
|
284
|
+
FILTER = auto()
|
|
285
|
+
FINAL = auto()
|
|
286
|
+
FIRST = auto()
|
|
287
|
+
FOR = auto()
|
|
288
|
+
FORCE = auto()
|
|
289
|
+
FOREIGN_KEY = auto()
|
|
290
|
+
FORMAT = auto()
|
|
291
|
+
FROM = auto()
|
|
292
|
+
FULL = auto()
|
|
293
|
+
FUNCTION = auto()
|
|
294
|
+
GET = auto()
|
|
295
|
+
GLOB = auto()
|
|
296
|
+
GLOBAL = auto()
|
|
297
|
+
GRANT = auto()
|
|
298
|
+
GROUP_BY = auto()
|
|
299
|
+
GROUPING_SETS = auto()
|
|
300
|
+
HAVING = auto()
|
|
301
|
+
HINT = auto()
|
|
302
|
+
IGNORE = auto()
|
|
303
|
+
ILIKE = auto()
|
|
304
|
+
ILIKE_ANY = auto()
|
|
305
|
+
IN = auto()
|
|
306
|
+
INDEX = auto()
|
|
307
|
+
INNER = auto()
|
|
308
|
+
INSERT = auto()
|
|
309
|
+
INTERSECT = auto()
|
|
310
|
+
INTERVAL = auto()
|
|
311
|
+
INTO = auto()
|
|
312
|
+
INTRODUCER = auto()
|
|
313
|
+
IRLIKE = auto()
|
|
314
|
+
IS = auto()
|
|
315
|
+
ISNULL = auto()
|
|
316
|
+
JOIN = auto()
|
|
317
|
+
JOIN_MARKER = auto()
|
|
318
|
+
KEEP = auto()
|
|
319
|
+
KEY = auto()
|
|
320
|
+
KILL = auto()
|
|
321
|
+
LANGUAGE = auto()
|
|
322
|
+
LATERAL = auto()
|
|
323
|
+
LEFT = auto()
|
|
324
|
+
LIKE = auto()
|
|
325
|
+
LIKE_ANY = auto()
|
|
326
|
+
LIMIT = auto()
|
|
327
|
+
LIST = auto()
|
|
328
|
+
LOAD = auto()
|
|
329
|
+
LOCK = auto()
|
|
330
|
+
MAP = auto()
|
|
331
|
+
MATCH_CONDITION = auto()
|
|
332
|
+
MATCH_RECOGNIZE = auto()
|
|
333
|
+
MEMBER_OF = auto()
|
|
334
|
+
MERGE = auto()
|
|
335
|
+
MOD = auto()
|
|
336
|
+
MODEL = auto()
|
|
337
|
+
NATURAL = auto()
|
|
338
|
+
NEXT = auto()
|
|
339
|
+
NOTHING = auto()
|
|
340
|
+
NOTNULL = auto()
|
|
341
|
+
NULL = auto()
|
|
342
|
+
OBJECT_IDENTIFIER = auto()
|
|
343
|
+
OFFSET = auto()
|
|
344
|
+
ON = auto()
|
|
345
|
+
ONLY = auto()
|
|
346
|
+
OPERATOR = auto()
|
|
347
|
+
ORDER_BY = auto()
|
|
348
|
+
ORDER_SIBLINGS_BY = auto()
|
|
349
|
+
ORDERED = auto()
|
|
350
|
+
ORDINALITY = auto()
|
|
351
|
+
OUTER = auto()
|
|
352
|
+
OVER = auto()
|
|
353
|
+
OVERLAPS = auto()
|
|
354
|
+
OVERWRITE = auto()
|
|
355
|
+
PARTITION = auto()
|
|
356
|
+
PARTITION_BY = auto()
|
|
357
|
+
PERCENT = auto()
|
|
358
|
+
PIVOT = auto()
|
|
359
|
+
PLACEHOLDER = auto()
|
|
360
|
+
POSITIONAL = auto()
|
|
361
|
+
PRAGMA = auto()
|
|
362
|
+
PREWHERE = auto()
|
|
363
|
+
PRIMARY_KEY = auto()
|
|
364
|
+
PROCEDURE = auto()
|
|
365
|
+
PROPERTIES = auto()
|
|
366
|
+
PSEUDO_TYPE = auto()
|
|
367
|
+
PUT = auto()
|
|
368
|
+
QUALIFY = auto()
|
|
369
|
+
QUOTE = auto()
|
|
370
|
+
RANGE = auto()
|
|
371
|
+
RECURSIVE = auto()
|
|
372
|
+
REFRESH = auto()
|
|
373
|
+
RENAME = auto()
|
|
374
|
+
REPLACE = auto()
|
|
375
|
+
RETURNING = auto()
|
|
376
|
+
REFERENCES = auto()
|
|
377
|
+
RIGHT = auto()
|
|
378
|
+
RLIKE = auto()
|
|
379
|
+
ROLLBACK = auto()
|
|
380
|
+
ROLLUP = auto()
|
|
381
|
+
ROW = auto()
|
|
382
|
+
ROWS = auto()
|
|
383
|
+
SELECT = auto()
|
|
384
|
+
SEMI = auto()
|
|
385
|
+
SEPARATOR = auto()
|
|
386
|
+
SEQUENCE = auto()
|
|
387
|
+
SERDE_PROPERTIES = auto()
|
|
388
|
+
SET = auto()
|
|
389
|
+
SETTINGS = auto()
|
|
390
|
+
SHOW = auto()
|
|
391
|
+
SIMILAR_TO = auto()
|
|
392
|
+
SOME = auto()
|
|
393
|
+
SORT_BY = auto()
|
|
394
|
+
START_WITH = auto()
|
|
395
|
+
STORAGE_INTEGRATION = auto()
|
|
396
|
+
STRAIGHT_JOIN = auto()
|
|
397
|
+
STRUCT = auto()
|
|
398
|
+
SUMMARIZE = auto()
|
|
399
|
+
TABLE_SAMPLE = auto()
|
|
400
|
+
TAG = auto()
|
|
401
|
+
TEMPORARY = auto()
|
|
402
|
+
TOP = auto()
|
|
403
|
+
THEN = auto()
|
|
404
|
+
TRUE = auto()
|
|
405
|
+
TRUNCATE = auto()
|
|
406
|
+
UNCACHE = auto()
|
|
407
|
+
UNION = auto()
|
|
408
|
+
UNNEST = auto()
|
|
409
|
+
UNPIVOT = auto()
|
|
410
|
+
UPDATE = auto()
|
|
411
|
+
USE = auto()
|
|
412
|
+
USING = auto()
|
|
413
|
+
VALUES = auto()
|
|
414
|
+
VIEW = auto()
|
|
415
|
+
VOLATILE = auto()
|
|
416
|
+
WHEN = auto()
|
|
417
|
+
WHERE = auto()
|
|
418
|
+
WINDOW = auto()
|
|
419
|
+
WITH = auto()
|
|
420
|
+
UNIQUE = auto()
|
|
421
|
+
VERSION_SNAPSHOT = auto()
|
|
422
|
+
TIMESTAMP_SNAPSHOT = auto()
|
|
423
|
+
OPTION = auto()
|
|
424
|
+
SINK = auto()
|
|
425
|
+
SOURCE = auto()
|
|
426
|
+
ANALYZE = auto()
|
|
427
|
+
NAMESPACE = auto()
|
|
428
|
+
EXPORT = auto()
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
_ALL_TOKEN_TYPES = list(TokenType)
|
|
432
|
+
_TOKEN_TYPE_TO_INDEX = {token_type: i for i, token_type in enumerate(_ALL_TOKEN_TYPES)}
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
class Token:
|
|
436
|
+
__slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")
|
|
437
|
+
|
|
438
|
+
@classmethod
|
|
439
|
+
def number(cls, number: int) -> Token:
|
|
440
|
+
"""Returns a NUMBER token with `number` as its text."""
|
|
441
|
+
return cls(TokenType.NUMBER, str(number))
|
|
442
|
+
|
|
443
|
+
@classmethod
|
|
444
|
+
def string(cls, string: str) -> Token:
|
|
445
|
+
"""Returns a STRING token with `string` as its text."""
|
|
446
|
+
return cls(TokenType.STRING, string)
|
|
447
|
+
|
|
448
|
+
@classmethod
|
|
449
|
+
def identifier(cls, identifier: str) -> Token:
|
|
450
|
+
"""Returns an IDENTIFIER token with `identifier` as its text."""
|
|
451
|
+
return cls(TokenType.IDENTIFIER, identifier)
|
|
452
|
+
|
|
453
|
+
@classmethod
|
|
454
|
+
def var(cls, var: str) -> Token:
|
|
455
|
+
"""Returns an VAR token with `var` as its text."""
|
|
456
|
+
return cls(TokenType.VAR, var)
|
|
457
|
+
|
|
458
|
+
def __init__(
|
|
459
|
+
self,
|
|
460
|
+
token_type: TokenType,
|
|
461
|
+
text: str,
|
|
462
|
+
line: int = 1,
|
|
463
|
+
col: int = 1,
|
|
464
|
+
start: int = 0,
|
|
465
|
+
end: int = 0,
|
|
466
|
+
comments: t.Optional[t.List[str]] = None,
|
|
467
|
+
) -> None:
|
|
468
|
+
"""Token initializer.
|
|
469
|
+
|
|
470
|
+
Args:
|
|
471
|
+
token_type: The TokenType Enum.
|
|
472
|
+
text: The text of the token.
|
|
473
|
+
line: The line that the token ends on.
|
|
474
|
+
col: The column that the token ends on.
|
|
475
|
+
start: The start index of the token.
|
|
476
|
+
end: The ending index of the token.
|
|
477
|
+
comments: The comments to attach to the token.
|
|
478
|
+
"""
|
|
479
|
+
self.token_type = token_type
|
|
480
|
+
self.text = text
|
|
481
|
+
self.line = line
|
|
482
|
+
self.col = col
|
|
483
|
+
self.start = start
|
|
484
|
+
self.end = end
|
|
485
|
+
self.comments = [] if comments is None else comments
|
|
486
|
+
|
|
487
|
+
def __repr__(self) -> str:
|
|
488
|
+
attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
|
|
489
|
+
return f"<Token {attributes}>"
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
class _Tokenizer(type):
|
|
493
|
+
def __new__(cls, clsname, bases, attrs):
|
|
494
|
+
klass = super().__new__(cls, clsname, bases, attrs)
|
|
495
|
+
|
|
496
|
+
def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
|
|
497
|
+
return dict(
|
|
498
|
+
(item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
def _quotes_to_format(
|
|
502
|
+
token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
|
|
503
|
+
) -> t.Dict[str, t.Tuple[str, TokenType]]:
|
|
504
|
+
return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
|
|
505
|
+
|
|
506
|
+
klass._QUOTES = _convert_quotes(klass.QUOTES)
|
|
507
|
+
klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)
|
|
508
|
+
|
|
509
|
+
klass._FORMAT_STRINGS = {
|
|
510
|
+
**{
|
|
511
|
+
p + s: (e, TokenType.NATIONAL_STRING)
|
|
512
|
+
for s, e in klass._QUOTES.items()
|
|
513
|
+
for p in ("n", "N")
|
|
514
|
+
},
|
|
515
|
+
**_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
|
|
516
|
+
**_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
|
|
517
|
+
**_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
|
|
518
|
+
**_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
|
|
519
|
+
**_quotes_to_format(TokenType.HEREDOC_STRING, klass.HEREDOC_STRINGS),
|
|
520
|
+
**_quotes_to_format(TokenType.UNICODE_STRING, klass.UNICODE_STRINGS),
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
|
|
524
|
+
klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
|
|
525
|
+
klass._COMMENTS = {
|
|
526
|
+
**dict(
|
|
527
|
+
(comment, None) if isinstance(comment, str) else (comment[0], comment[1])
|
|
528
|
+
for comment in klass.COMMENTS
|
|
529
|
+
),
|
|
530
|
+
"{#": "#}", # Ensure Jinja comments are tokenized correctly in all dialects
|
|
531
|
+
}
|
|
532
|
+
if klass.HINT_START in klass.KEYWORDS:
|
|
533
|
+
klass._COMMENTS[klass.HINT_START] = "*/"
|
|
534
|
+
|
|
535
|
+
klass._KEYWORD_TRIE = new_trie(
|
|
536
|
+
key.upper()
|
|
537
|
+
for key in (
|
|
538
|
+
*klass.KEYWORDS,
|
|
539
|
+
*klass._COMMENTS,
|
|
540
|
+
*klass._QUOTES,
|
|
541
|
+
*klass._FORMAT_STRINGS,
|
|
542
|
+
)
|
|
543
|
+
if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
|
|
544
|
+
)
|
|
545
|
+
|
|
546
|
+
if USE_RS_TOKENIZER:
|
|
547
|
+
settings = RsTokenizerSettings(
|
|
548
|
+
white_space={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.WHITE_SPACE.items()},
|
|
549
|
+
single_tokens={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.SINGLE_TOKENS.items()},
|
|
550
|
+
keywords={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.KEYWORDS.items()},
|
|
551
|
+
numeric_literals=klass.NUMERIC_LITERALS,
|
|
552
|
+
identifiers=klass._IDENTIFIERS,
|
|
553
|
+
identifier_escapes=klass._IDENTIFIER_ESCAPES,
|
|
554
|
+
string_escapes=klass._STRING_ESCAPES,
|
|
555
|
+
quotes=klass._QUOTES,
|
|
556
|
+
format_strings={
|
|
557
|
+
k: (v1, _TOKEN_TYPE_TO_INDEX[v2])
|
|
558
|
+
for k, (v1, v2) in klass._FORMAT_STRINGS.items()
|
|
559
|
+
},
|
|
560
|
+
has_bit_strings=bool(klass.BIT_STRINGS),
|
|
561
|
+
has_hex_strings=bool(klass.HEX_STRINGS),
|
|
562
|
+
comments=klass._COMMENTS,
|
|
563
|
+
var_single_tokens=klass.VAR_SINGLE_TOKENS,
|
|
564
|
+
commands={_TOKEN_TYPE_TO_INDEX[v] for v in klass.COMMANDS},
|
|
565
|
+
command_prefix_tokens={
|
|
566
|
+
_TOKEN_TYPE_TO_INDEX[v] for v in klass.COMMAND_PREFIX_TOKENS
|
|
567
|
+
},
|
|
568
|
+
heredoc_tag_is_identifier=klass.HEREDOC_TAG_IS_IDENTIFIER,
|
|
569
|
+
string_escapes_allowed_in_raw_strings=klass.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
|
|
570
|
+
nested_comments=klass.NESTED_COMMENTS,
|
|
571
|
+
hint_start=klass.HINT_START,
|
|
572
|
+
tokens_preceding_hint={
|
|
573
|
+
_TOKEN_TYPE_TO_INDEX[v] for v in klass.TOKENS_PRECEDING_HINT
|
|
574
|
+
},
|
|
575
|
+
)
|
|
576
|
+
token_types = RsTokenTypeSettings(
|
|
577
|
+
bit_string=_TOKEN_TYPE_TO_INDEX[TokenType.BIT_STRING],
|
|
578
|
+
break_=_TOKEN_TYPE_TO_INDEX[TokenType.BREAK],
|
|
579
|
+
dcolon=_TOKEN_TYPE_TO_INDEX[TokenType.DCOLON],
|
|
580
|
+
heredoc_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEREDOC_STRING],
|
|
581
|
+
raw_string=_TOKEN_TYPE_TO_INDEX[TokenType.RAW_STRING],
|
|
582
|
+
hex_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEX_STRING],
|
|
583
|
+
identifier=_TOKEN_TYPE_TO_INDEX[TokenType.IDENTIFIER],
|
|
584
|
+
number=_TOKEN_TYPE_TO_INDEX[TokenType.NUMBER],
|
|
585
|
+
parameter=_TOKEN_TYPE_TO_INDEX[TokenType.PARAMETER],
|
|
586
|
+
semicolon=_TOKEN_TYPE_TO_INDEX[TokenType.SEMICOLON],
|
|
587
|
+
string=_TOKEN_TYPE_TO_INDEX[TokenType.STRING],
|
|
588
|
+
var=_TOKEN_TYPE_TO_INDEX[TokenType.VAR],
|
|
589
|
+
heredoc_string_alternative=_TOKEN_TYPE_TO_INDEX[klass.HEREDOC_STRING_ALTERNATIVE],
|
|
590
|
+
hint=_TOKEN_TYPE_TO_INDEX[TokenType.HINT],
|
|
591
|
+
)
|
|
592
|
+
klass._RS_TOKENIZER = RsTokenizer(settings, token_types)
|
|
593
|
+
else:
|
|
594
|
+
klass._RS_TOKENIZER = None
|
|
595
|
+
|
|
596
|
+
return klass
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
class Tokenizer(metaclass=_Tokenizer):
|
|
600
|
+
SINGLE_TOKENS = {
|
|
601
|
+
"(": TokenType.L_PAREN,
|
|
602
|
+
")": TokenType.R_PAREN,
|
|
603
|
+
"[": TokenType.L_BRACKET,
|
|
604
|
+
"]": TokenType.R_BRACKET,
|
|
605
|
+
"{": TokenType.L_BRACE,
|
|
606
|
+
"}": TokenType.R_BRACE,
|
|
607
|
+
"&": TokenType.AMP,
|
|
608
|
+
"^": TokenType.CARET,
|
|
609
|
+
":": TokenType.COLON,
|
|
610
|
+
",": TokenType.COMMA,
|
|
611
|
+
".": TokenType.DOT,
|
|
612
|
+
"-": TokenType.DASH,
|
|
613
|
+
"=": TokenType.EQ,
|
|
614
|
+
">": TokenType.GT,
|
|
615
|
+
"<": TokenType.LT,
|
|
616
|
+
"%": TokenType.MOD,
|
|
617
|
+
"!": TokenType.NOT,
|
|
618
|
+
"|": TokenType.PIPE,
|
|
619
|
+
"+": TokenType.PLUS,
|
|
620
|
+
";": TokenType.SEMICOLON,
|
|
621
|
+
"/": TokenType.SLASH,
|
|
622
|
+
"\\": TokenType.BACKSLASH,
|
|
623
|
+
"*": TokenType.STAR,
|
|
624
|
+
"~": TokenType.TILDA,
|
|
625
|
+
"?": TokenType.PLACEHOLDER,
|
|
626
|
+
"@": TokenType.PARAMETER,
|
|
627
|
+
"#": TokenType.HASH,
|
|
628
|
+
# Used for breaking a var like x'y' but nothing else the token type doesn't matter
|
|
629
|
+
"'": TokenType.UNKNOWN,
|
|
630
|
+
"`": TokenType.UNKNOWN,
|
|
631
|
+
'"': TokenType.UNKNOWN,
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
|
|
635
|
+
BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
|
|
636
|
+
HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
|
|
637
|
+
RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
|
|
638
|
+
HEREDOC_STRINGS: t.List[str | t.Tuple[str, str]] = []
|
|
639
|
+
UNICODE_STRINGS: t.List[str | t.Tuple[str, str]] = []
|
|
640
|
+
IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
|
|
641
|
+
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
|
|
642
|
+
STRING_ESCAPES = ["'"]
|
|
643
|
+
VAR_SINGLE_TOKENS: t.Set[str] = set()
|
|
644
|
+
|
|
645
|
+
# The strings in this list can always be used as escapes, regardless of the surrounding
|
|
646
|
+
# identifier delimiters. By default, the closing delimiter is assumed to also act as an
|
|
647
|
+
# identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
|
|
648
|
+
IDENTIFIER_ESCAPES: t.List[str] = []
|
|
649
|
+
|
|
650
|
+
# Whether the heredoc tags follow the same lexical rules as unquoted identifiers
|
|
651
|
+
HEREDOC_TAG_IS_IDENTIFIER = False
|
|
652
|
+
|
|
653
|
+
# Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
|
|
654
|
+
HEREDOC_STRING_ALTERNATIVE = TokenType.VAR
|
|
655
|
+
|
|
656
|
+
# Whether string escape characters function as such when placed within raw strings
|
|
657
|
+
STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
|
|
658
|
+
|
|
659
|
+
NESTED_COMMENTS = True
|
|
660
|
+
|
|
661
|
+
HINT_START = "/*+"
|
|
662
|
+
|
|
663
|
+
TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}
|
|
664
|
+
|
|
665
|
+
# Autofilled
|
|
666
|
+
_COMMENTS: t.Dict[str, str] = {}
|
|
667
|
+
_FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
|
|
668
|
+
_IDENTIFIERS: t.Dict[str, str] = {}
|
|
669
|
+
_IDENTIFIER_ESCAPES: t.Set[str] = set()
|
|
670
|
+
_QUOTES: t.Dict[str, str] = {}
|
|
671
|
+
_STRING_ESCAPES: t.Set[str] = set()
|
|
672
|
+
_KEYWORD_TRIE: t.Dict = {}
|
|
673
|
+
_RS_TOKENIZER: t.Optional[t.Any] = None
|
|
674
|
+
|
|
675
|
+
KEYWORDS: t.Dict[str, TokenType] = {
|
|
676
|
+
**{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
|
|
677
|
+
**{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
|
|
678
|
+
**{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
|
|
679
|
+
**{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
|
|
680
|
+
HINT_START: TokenType.HINT,
|
|
681
|
+
"==": TokenType.EQ,
|
|
682
|
+
"::": TokenType.DCOLON,
|
|
683
|
+
"||": TokenType.DPIPE,
|
|
684
|
+
"|>": TokenType.PIPE_GT,
|
|
685
|
+
">=": TokenType.GTE,
|
|
686
|
+
"<=": TokenType.LTE,
|
|
687
|
+
"<>": TokenType.NEQ,
|
|
688
|
+
"!=": TokenType.NEQ,
|
|
689
|
+
":=": TokenType.COLON_EQ,
|
|
690
|
+
"<=>": TokenType.NULLSAFE_EQ,
|
|
691
|
+
"->": TokenType.ARROW,
|
|
692
|
+
"->>": TokenType.DARROW,
|
|
693
|
+
"=>": TokenType.FARROW,
|
|
694
|
+
"#>": TokenType.HASH_ARROW,
|
|
695
|
+
"#>>": TokenType.DHASH_ARROW,
|
|
696
|
+
"<->": TokenType.LR_ARROW,
|
|
697
|
+
"&&": TokenType.DAMP,
|
|
698
|
+
"??": TokenType.DQMARK,
|
|
699
|
+
"~~~": TokenType.GLOB,
|
|
700
|
+
"~~": TokenType.LIKE,
|
|
701
|
+
"~~*": TokenType.ILIKE,
|
|
702
|
+
"~*": TokenType.IRLIKE,
|
|
703
|
+
"ALL": TokenType.ALL,
|
|
704
|
+
"ALWAYS": TokenType.ALWAYS,
|
|
705
|
+
"AND": TokenType.AND,
|
|
706
|
+
"ANTI": TokenType.ANTI,
|
|
707
|
+
"ANY": TokenType.ANY,
|
|
708
|
+
"ASC": TokenType.ASC,
|
|
709
|
+
"AS": TokenType.ALIAS,
|
|
710
|
+
"ASOF": TokenType.ASOF,
|
|
711
|
+
"AUTOINCREMENT": TokenType.AUTO_INCREMENT,
|
|
712
|
+
"AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
|
|
713
|
+
"BEGIN": TokenType.BEGIN,
|
|
714
|
+
"BETWEEN": TokenType.BETWEEN,
|
|
715
|
+
"CACHE": TokenType.CACHE,
|
|
716
|
+
"UNCACHE": TokenType.UNCACHE,
|
|
717
|
+
"CASE": TokenType.CASE,
|
|
718
|
+
"CHARACTER SET": TokenType.CHARACTER_SET,
|
|
719
|
+
"CLUSTER BY": TokenType.CLUSTER_BY,
|
|
720
|
+
"COLLATE": TokenType.COLLATE,
|
|
721
|
+
"COLUMN": TokenType.COLUMN,
|
|
722
|
+
"COMMIT": TokenType.COMMIT,
|
|
723
|
+
"CONNECT BY": TokenType.CONNECT_BY,
|
|
724
|
+
"CONSTRAINT": TokenType.CONSTRAINT,
|
|
725
|
+
"COPY": TokenType.COPY,
|
|
726
|
+
"CREATE": TokenType.CREATE,
|
|
727
|
+
"CROSS": TokenType.CROSS,
|
|
728
|
+
"CUBE": TokenType.CUBE,
|
|
729
|
+
"CURRENT_DATE": TokenType.CURRENT_DATE,
|
|
730
|
+
"CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
|
|
731
|
+
"CURRENT_TIME": TokenType.CURRENT_TIME,
|
|
732
|
+
"CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
|
|
733
|
+
"CURRENT_USER": TokenType.CURRENT_USER,
|
|
734
|
+
"DATABASE": TokenType.DATABASE,
|
|
735
|
+
"DEFAULT": TokenType.DEFAULT,
|
|
736
|
+
"DELETE": TokenType.DELETE,
|
|
737
|
+
"DESC": TokenType.DESC,
|
|
738
|
+
"DESCRIBE": TokenType.DESCRIBE,
|
|
739
|
+
"DISTINCT": TokenType.DISTINCT,
|
|
740
|
+
"DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
|
|
741
|
+
"DIV": TokenType.DIV,
|
|
742
|
+
"DROP": TokenType.DROP,
|
|
743
|
+
"ELSE": TokenType.ELSE,
|
|
744
|
+
"END": TokenType.END,
|
|
745
|
+
"ENUM": TokenType.ENUM,
|
|
746
|
+
"ESCAPE": TokenType.ESCAPE,
|
|
747
|
+
"EXCEPT": TokenType.EXCEPT,
|
|
748
|
+
"EXECUTE": TokenType.EXECUTE,
|
|
749
|
+
"EXISTS": TokenType.EXISTS,
|
|
750
|
+
"FALSE": TokenType.FALSE,
|
|
751
|
+
"FETCH": TokenType.FETCH,
|
|
752
|
+
"FILTER": TokenType.FILTER,
|
|
753
|
+
"FIRST": TokenType.FIRST,
|
|
754
|
+
"FULL": TokenType.FULL,
|
|
755
|
+
"FUNCTION": TokenType.FUNCTION,
|
|
756
|
+
"FOR": TokenType.FOR,
|
|
757
|
+
"FOREIGN KEY": TokenType.FOREIGN_KEY,
|
|
758
|
+
"FORMAT": TokenType.FORMAT,
|
|
759
|
+
"FROM": TokenType.FROM,
|
|
760
|
+
"GEOGRAPHY": TokenType.GEOGRAPHY,
|
|
761
|
+
"GEOMETRY": TokenType.GEOMETRY,
|
|
762
|
+
"GLOB": TokenType.GLOB,
|
|
763
|
+
"GROUP BY": TokenType.GROUP_BY,
|
|
764
|
+
"GROUPING SETS": TokenType.GROUPING_SETS,
|
|
765
|
+
"HAVING": TokenType.HAVING,
|
|
766
|
+
"ILIKE": TokenType.ILIKE,
|
|
767
|
+
"IN": TokenType.IN,
|
|
768
|
+
"INDEX": TokenType.INDEX,
|
|
769
|
+
"INET": TokenType.INET,
|
|
770
|
+
"INNER": TokenType.INNER,
|
|
771
|
+
"INSERT": TokenType.INSERT,
|
|
772
|
+
"INTERVAL": TokenType.INTERVAL,
|
|
773
|
+
"INTERSECT": TokenType.INTERSECT,
|
|
774
|
+
"INTO": TokenType.INTO,
|
|
775
|
+
"IS": TokenType.IS,
|
|
776
|
+
"ISNULL": TokenType.ISNULL,
|
|
777
|
+
"JOIN": TokenType.JOIN,
|
|
778
|
+
"KEEP": TokenType.KEEP,
|
|
779
|
+
"KILL": TokenType.KILL,
|
|
780
|
+
"LATERAL": TokenType.LATERAL,
|
|
781
|
+
"LEFT": TokenType.LEFT,
|
|
782
|
+
"LIKE": TokenType.LIKE,
|
|
783
|
+
"LIMIT": TokenType.LIMIT,
|
|
784
|
+
"LOAD": TokenType.LOAD,
|
|
785
|
+
"LOCK": TokenType.LOCK,
|
|
786
|
+
"MERGE": TokenType.MERGE,
|
|
787
|
+
"NAMESPACE": TokenType.NAMESPACE,
|
|
788
|
+
"NATURAL": TokenType.NATURAL,
|
|
789
|
+
"NEXT": TokenType.NEXT,
|
|
790
|
+
"NOT": TokenType.NOT,
|
|
791
|
+
"NOTNULL": TokenType.NOTNULL,
|
|
792
|
+
"NULL": TokenType.NULL,
|
|
793
|
+
"OBJECT": TokenType.OBJECT,
|
|
794
|
+
"OFFSET": TokenType.OFFSET,
|
|
795
|
+
"ON": TokenType.ON,
|
|
796
|
+
"OR": TokenType.OR,
|
|
797
|
+
"XOR": TokenType.XOR,
|
|
798
|
+
"ORDER BY": TokenType.ORDER_BY,
|
|
799
|
+
"ORDINALITY": TokenType.ORDINALITY,
|
|
800
|
+
"OUTER": TokenType.OUTER,
|
|
801
|
+
"OVER": TokenType.OVER,
|
|
802
|
+
"OVERLAPS": TokenType.OVERLAPS,
|
|
803
|
+
"OVERWRITE": TokenType.OVERWRITE,
|
|
804
|
+
"PARTITION": TokenType.PARTITION,
|
|
805
|
+
"PARTITION BY": TokenType.PARTITION_BY,
|
|
806
|
+
"PARTITIONED BY": TokenType.PARTITION_BY,
|
|
807
|
+
"PARTITIONED_BY": TokenType.PARTITION_BY,
|
|
808
|
+
"PERCENT": TokenType.PERCENT,
|
|
809
|
+
"PIVOT": TokenType.PIVOT,
|
|
810
|
+
"PRAGMA": TokenType.PRAGMA,
|
|
811
|
+
"PRIMARY KEY": TokenType.PRIMARY_KEY,
|
|
812
|
+
"PROCEDURE": TokenType.PROCEDURE,
|
|
813
|
+
"QUALIFY": TokenType.QUALIFY,
|
|
814
|
+
"RANGE": TokenType.RANGE,
|
|
815
|
+
"RECURSIVE": TokenType.RECURSIVE,
|
|
816
|
+
"REGEXP": TokenType.RLIKE,
|
|
817
|
+
"RENAME": TokenType.RENAME,
|
|
818
|
+
"REPLACE": TokenType.REPLACE,
|
|
819
|
+
"RETURNING": TokenType.RETURNING,
|
|
820
|
+
"REFERENCES": TokenType.REFERENCES,
|
|
821
|
+
"RIGHT": TokenType.RIGHT,
|
|
822
|
+
"RLIKE": TokenType.RLIKE,
|
|
823
|
+
"ROLLBACK": TokenType.ROLLBACK,
|
|
824
|
+
"ROLLUP": TokenType.ROLLUP,
|
|
825
|
+
"ROW": TokenType.ROW,
|
|
826
|
+
"ROWS": TokenType.ROWS,
|
|
827
|
+
"SCHEMA": TokenType.SCHEMA,
|
|
828
|
+
"SELECT": TokenType.SELECT,
|
|
829
|
+
"SEMI": TokenType.SEMI,
|
|
830
|
+
"SET": TokenType.SET,
|
|
831
|
+
"SETTINGS": TokenType.SETTINGS,
|
|
832
|
+
"SHOW": TokenType.SHOW,
|
|
833
|
+
"SIMILAR TO": TokenType.SIMILAR_TO,
|
|
834
|
+
"SOME": TokenType.SOME,
|
|
835
|
+
"SORT BY": TokenType.SORT_BY,
|
|
836
|
+
"START WITH": TokenType.START_WITH,
|
|
837
|
+
"STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
|
|
838
|
+
"TABLE": TokenType.TABLE,
|
|
839
|
+
"TABLESAMPLE": TokenType.TABLE_SAMPLE,
|
|
840
|
+
"TEMP": TokenType.TEMPORARY,
|
|
841
|
+
"TEMPORARY": TokenType.TEMPORARY,
|
|
842
|
+
"THEN": TokenType.THEN,
|
|
843
|
+
"TRUE": TokenType.TRUE,
|
|
844
|
+
"TRUNCATE": TokenType.TRUNCATE,
|
|
845
|
+
"UNION": TokenType.UNION,
|
|
846
|
+
"UNKNOWN": TokenType.UNKNOWN,
|
|
847
|
+
"UNNEST": TokenType.UNNEST,
|
|
848
|
+
"UNPIVOT": TokenType.UNPIVOT,
|
|
849
|
+
"UPDATE": TokenType.UPDATE,
|
|
850
|
+
"USE": TokenType.USE,
|
|
851
|
+
"USING": TokenType.USING,
|
|
852
|
+
"UUID": TokenType.UUID,
|
|
853
|
+
"VALUES": TokenType.VALUES,
|
|
854
|
+
"VIEW": TokenType.VIEW,
|
|
855
|
+
"VOLATILE": TokenType.VOLATILE,
|
|
856
|
+
"WHEN": TokenType.WHEN,
|
|
857
|
+
"WHERE": TokenType.WHERE,
|
|
858
|
+
"WINDOW": TokenType.WINDOW,
|
|
859
|
+
"WITH": TokenType.WITH,
|
|
860
|
+
"APPLY": TokenType.APPLY,
|
|
861
|
+
"ARRAY": TokenType.ARRAY,
|
|
862
|
+
"BIT": TokenType.BIT,
|
|
863
|
+
"BOOL": TokenType.BOOLEAN,
|
|
864
|
+
"BOOLEAN": TokenType.BOOLEAN,
|
|
865
|
+
"BYTE": TokenType.TINYINT,
|
|
866
|
+
"MEDIUMINT": TokenType.MEDIUMINT,
|
|
867
|
+
"INT1": TokenType.TINYINT,
|
|
868
|
+
"TINYINT": TokenType.TINYINT,
|
|
869
|
+
"INT16": TokenType.SMALLINT,
|
|
870
|
+
"SHORT": TokenType.SMALLINT,
|
|
871
|
+
"SMALLINT": TokenType.SMALLINT,
|
|
872
|
+
"HUGEINT": TokenType.INT128,
|
|
873
|
+
"UHUGEINT": TokenType.UINT128,
|
|
874
|
+
"INT2": TokenType.SMALLINT,
|
|
875
|
+
"INTEGER": TokenType.INT,
|
|
876
|
+
"INT": TokenType.INT,
|
|
877
|
+
"INT4": TokenType.INT,
|
|
878
|
+
"INT32": TokenType.INT,
|
|
879
|
+
"INT64": TokenType.BIGINT,
|
|
880
|
+
"INT128": TokenType.INT128,
|
|
881
|
+
"INT256": TokenType.INT256,
|
|
882
|
+
"LONG": TokenType.BIGINT,
|
|
883
|
+
"BIGINT": TokenType.BIGINT,
|
|
884
|
+
"INT8": TokenType.TINYINT,
|
|
885
|
+
"UINT": TokenType.UINT,
|
|
886
|
+
"UINT128": TokenType.UINT128,
|
|
887
|
+
"UINT256": TokenType.UINT256,
|
|
888
|
+
"DEC": TokenType.DECIMAL,
|
|
889
|
+
"DECIMAL": TokenType.DECIMAL,
|
|
890
|
+
"DECIMAL32": TokenType.DECIMAL32,
|
|
891
|
+
"DECIMAL64": TokenType.DECIMAL64,
|
|
892
|
+
"DECIMAL128": TokenType.DECIMAL128,
|
|
893
|
+
"DECIMAL256": TokenType.DECIMAL256,
|
|
894
|
+
"BIGDECIMAL": TokenType.BIGDECIMAL,
|
|
895
|
+
"BIGNUMERIC": TokenType.BIGDECIMAL,
|
|
896
|
+
"LIST": TokenType.LIST,
|
|
897
|
+
"MAP": TokenType.MAP,
|
|
898
|
+
"NULLABLE": TokenType.NULLABLE,
|
|
899
|
+
"NUMBER": TokenType.DECIMAL,
|
|
900
|
+
"NUMERIC": TokenType.DECIMAL,
|
|
901
|
+
"FIXED": TokenType.DECIMAL,
|
|
902
|
+
"REAL": TokenType.FLOAT,
|
|
903
|
+
"FLOAT": TokenType.FLOAT,
|
|
904
|
+
"FLOAT4": TokenType.FLOAT,
|
|
905
|
+
"FLOAT8": TokenType.DOUBLE,
|
|
906
|
+
"DOUBLE": TokenType.DOUBLE,
|
|
907
|
+
"DOUBLE PRECISION": TokenType.DOUBLE,
|
|
908
|
+
"JSON": TokenType.JSON,
|
|
909
|
+
"JSONB": TokenType.JSONB,
|
|
910
|
+
"CHAR": TokenType.CHAR,
|
|
911
|
+
"CHARACTER": TokenType.CHAR,
|
|
912
|
+
"CHAR VARYING": TokenType.VARCHAR,
|
|
913
|
+
"CHARACTER VARYING": TokenType.VARCHAR,
|
|
914
|
+
"NCHAR": TokenType.NCHAR,
|
|
915
|
+
"VARCHAR": TokenType.VARCHAR,
|
|
916
|
+
"VARCHAR2": TokenType.VARCHAR,
|
|
917
|
+
"NVARCHAR": TokenType.NVARCHAR,
|
|
918
|
+
"NVARCHAR2": TokenType.NVARCHAR,
|
|
919
|
+
"BPCHAR": TokenType.BPCHAR,
|
|
920
|
+
"STR": TokenType.TEXT,
|
|
921
|
+
"STRING": TokenType.TEXT,
|
|
922
|
+
"TEXT": TokenType.TEXT,
|
|
923
|
+
"LONGTEXT": TokenType.LONGTEXT,
|
|
924
|
+
"MEDIUMTEXT": TokenType.MEDIUMTEXT,
|
|
925
|
+
"TINYTEXT": TokenType.TINYTEXT,
|
|
926
|
+
"CLOB": TokenType.TEXT,
|
|
927
|
+
"LONGVARCHAR": TokenType.TEXT,
|
|
928
|
+
"BINARY": TokenType.BINARY,
|
|
929
|
+
"BLOB": TokenType.VARBINARY,
|
|
930
|
+
"LONGBLOB": TokenType.LONGBLOB,
|
|
931
|
+
"MEDIUMBLOB": TokenType.MEDIUMBLOB,
|
|
932
|
+
"TINYBLOB": TokenType.TINYBLOB,
|
|
933
|
+
"BYTEA": TokenType.VARBINARY,
|
|
934
|
+
"VARBINARY": TokenType.VARBINARY,
|
|
935
|
+
"TIME": TokenType.TIME,
|
|
936
|
+
"TIMETZ": TokenType.TIMETZ,
|
|
937
|
+
"TIMESTAMP": TokenType.TIMESTAMP,
|
|
938
|
+
"TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
|
|
939
|
+
"TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
|
|
940
|
+
"TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
|
|
941
|
+
"TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
|
|
942
|
+
"TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
|
|
943
|
+
"DATE": TokenType.DATE,
|
|
944
|
+
"DATETIME": TokenType.DATETIME,
|
|
945
|
+
"INT4RANGE": TokenType.INT4RANGE,
|
|
946
|
+
"INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
|
|
947
|
+
"INT8RANGE": TokenType.INT8RANGE,
|
|
948
|
+
"INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
|
|
949
|
+
"NUMRANGE": TokenType.NUMRANGE,
|
|
950
|
+
"NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
|
|
951
|
+
"TSRANGE": TokenType.TSRANGE,
|
|
952
|
+
"TSMULTIRANGE": TokenType.TSMULTIRANGE,
|
|
953
|
+
"TSTZRANGE": TokenType.TSTZRANGE,
|
|
954
|
+
"TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
|
|
955
|
+
"DATERANGE": TokenType.DATERANGE,
|
|
956
|
+
"DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
|
|
957
|
+
"UNIQUE": TokenType.UNIQUE,
|
|
958
|
+
"VECTOR": TokenType.VECTOR,
|
|
959
|
+
"STRUCT": TokenType.STRUCT,
|
|
960
|
+
"SEQUENCE": TokenType.SEQUENCE,
|
|
961
|
+
"VARIANT": TokenType.VARIANT,
|
|
962
|
+
"ALTER": TokenType.ALTER,
|
|
963
|
+
"ANALYZE": TokenType.ANALYZE,
|
|
964
|
+
"CALL": TokenType.COMMAND,
|
|
965
|
+
"COMMENT": TokenType.COMMENT,
|
|
966
|
+
"EXPLAIN": TokenType.COMMAND,
|
|
967
|
+
"GRANT": TokenType.GRANT,
|
|
968
|
+
"OPTIMIZE": TokenType.COMMAND,
|
|
969
|
+
"PREPARE": TokenType.COMMAND,
|
|
970
|
+
"VACUUM": TokenType.COMMAND,
|
|
971
|
+
"USER-DEFINED": TokenType.USERDEFINED,
|
|
972
|
+
"FOR VERSION": TokenType.VERSION_SNAPSHOT,
|
|
973
|
+
"FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
|
|
977
|
+
" ": TokenType.SPACE,
|
|
978
|
+
"\t": TokenType.SPACE,
|
|
979
|
+
"\n": TokenType.BREAK,
|
|
980
|
+
"\r": TokenType.BREAK,
|
|
981
|
+
}
|
|
982
|
+
|
|
983
|
+
COMMANDS = {
|
|
984
|
+
TokenType.COMMAND,
|
|
985
|
+
TokenType.EXECUTE,
|
|
986
|
+
TokenType.FETCH,
|
|
987
|
+
TokenType.SHOW,
|
|
988
|
+
TokenType.RENAME,
|
|
989
|
+
}
|
|
990
|
+
|
|
991
|
+
COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
|
|
992
|
+
|
|
993
|
+
# Handle numeric literals like in hive (3L = BIGINT)
|
|
994
|
+
NUMERIC_LITERALS: t.Dict[str, str] = {}
|
|
995
|
+
|
|
996
|
+
COMMENTS = ["--", ("/*", "*/")]
|
|
997
|
+
|
|
998
|
+
__slots__ = (
|
|
999
|
+
"sql",
|
|
1000
|
+
"size",
|
|
1001
|
+
"tokens",
|
|
1002
|
+
"dialect",
|
|
1003
|
+
"use_rs_tokenizer",
|
|
1004
|
+
"_start",
|
|
1005
|
+
"_current",
|
|
1006
|
+
"_line",
|
|
1007
|
+
"_col",
|
|
1008
|
+
"_comments",
|
|
1009
|
+
"_char",
|
|
1010
|
+
"_end",
|
|
1011
|
+
"_peek",
|
|
1012
|
+
"_prev_token_line",
|
|
1013
|
+
"_rs_dialect_settings",
|
|
1014
|
+
)
|
|
1015
|
+
|
|
1016
|
+
def __init__(
|
|
1017
|
+
self, dialect: DialectType = None, use_rs_tokenizer: t.Optional[bool] = None
|
|
1018
|
+
) -> None:
|
|
1019
|
+
from sqlglot.dialects import Dialect
|
|
1020
|
+
|
|
1021
|
+
self.dialect = Dialect.get_or_raise(dialect)
|
|
1022
|
+
|
|
1023
|
+
# initialize `use_rs_tokenizer`, and allow it to be overwritten per Tokenizer instance
|
|
1024
|
+
self.use_rs_tokenizer = (
|
|
1025
|
+
use_rs_tokenizer if use_rs_tokenizer is not None else USE_RS_TOKENIZER
|
|
1026
|
+
)
|
|
1027
|
+
|
|
1028
|
+
if self.use_rs_tokenizer:
|
|
1029
|
+
self._rs_dialect_settings = RsTokenizerDialectSettings(
|
|
1030
|
+
unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
|
|
1031
|
+
identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
|
|
1032
|
+
numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
|
|
1033
|
+
)
|
|
1034
|
+
|
|
1035
|
+
self.reset()
|
|
1036
|
+
|
|
1037
|
+
def reset(self) -> None:
|
|
1038
|
+
self.sql = ""
|
|
1039
|
+
self.size = 0
|
|
1040
|
+
self.tokens: t.List[Token] = []
|
|
1041
|
+
self._start = 0
|
|
1042
|
+
self._current = 0
|
|
1043
|
+
self._line = 1
|
|
1044
|
+
self._col = 0
|
|
1045
|
+
self._comments: t.List[str] = []
|
|
1046
|
+
|
|
1047
|
+
self._char = ""
|
|
1048
|
+
self._end = False
|
|
1049
|
+
self._peek = ""
|
|
1050
|
+
self._prev_token_line = -1
|
|
1051
|
+
|
|
1052
|
+
def tokenize(self, sql: str) -> t.List[Token]:
|
|
1053
|
+
"""Returns a list of tokens corresponding to the SQL string `sql`."""
|
|
1054
|
+
if self.use_rs_tokenizer:
|
|
1055
|
+
return self.tokenize_rs(sql)
|
|
1056
|
+
|
|
1057
|
+
self.reset()
|
|
1058
|
+
self.sql = sql
|
|
1059
|
+
self.size = len(sql)
|
|
1060
|
+
|
|
1061
|
+
try:
|
|
1062
|
+
self._scan()
|
|
1063
|
+
except Exception as e:
|
|
1064
|
+
start = max(self._current - 50, 0)
|
|
1065
|
+
end = min(self._current + 50, self.size - 1)
|
|
1066
|
+
context = self.sql[start:end]
|
|
1067
|
+
raise TokenError(f"Error tokenizing '{context}'") from e
|
|
1068
|
+
|
|
1069
|
+
return self.tokens
|
|
1070
|
+
|
|
1071
|
+
def _scan(self, until: t.Optional[t.Callable] = None) -> None:
|
|
1072
|
+
while self.size and not self._end:
|
|
1073
|
+
current = self._current
|
|
1074
|
+
|
|
1075
|
+
# Skip spaces here rather than iteratively calling advance() for performance reasons
|
|
1076
|
+
while current < self.size:
|
|
1077
|
+
char = self.sql[current]
|
|
1078
|
+
|
|
1079
|
+
if char.isspace() and (char == " " or char == "\t"):
|
|
1080
|
+
current += 1
|
|
1081
|
+
else:
|
|
1082
|
+
break
|
|
1083
|
+
|
|
1084
|
+
offset = current - self._current if current > self._current else 1
|
|
1085
|
+
|
|
1086
|
+
self._start = current
|
|
1087
|
+
self._advance(offset)
|
|
1088
|
+
|
|
1089
|
+
if not self._char.isspace():
|
|
1090
|
+
if self._char.isdigit():
|
|
1091
|
+
self._scan_number()
|
|
1092
|
+
elif self._char in self._IDENTIFIERS:
|
|
1093
|
+
self._scan_identifier(self._IDENTIFIERS[self._char])
|
|
1094
|
+
else:
|
|
1095
|
+
self._scan_keywords()
|
|
1096
|
+
|
|
1097
|
+
if until and until():
|
|
1098
|
+
break
|
|
1099
|
+
|
|
1100
|
+
if self.tokens and self._comments:
|
|
1101
|
+
self.tokens[-1].comments.extend(self._comments)
|
|
1102
|
+
|
|
1103
|
+
def _chars(self, size: int) -> str:
|
|
1104
|
+
if size == 1:
|
|
1105
|
+
return self._char
|
|
1106
|
+
|
|
1107
|
+
start = self._current - 1
|
|
1108
|
+
end = start + size
|
|
1109
|
+
|
|
1110
|
+
return self.sql[start:end] if end <= self.size else ""
|
|
1111
|
+
|
|
1112
|
+
def _advance(self, i: int = 1, alnum: bool = False) -> None:
|
|
1113
|
+
if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
|
|
1114
|
+
# Ensures we don't count an extra line if we get a \r\n line break sequence
|
|
1115
|
+
if not (self._char == "\r" and self._peek == "\n"):
|
|
1116
|
+
self._col = i
|
|
1117
|
+
self._line += 1
|
|
1118
|
+
else:
|
|
1119
|
+
self._col += i
|
|
1120
|
+
|
|
1121
|
+
self._current += i
|
|
1122
|
+
self._end = self._current >= self.size
|
|
1123
|
+
self._char = self.sql[self._current - 1]
|
|
1124
|
+
self._peek = "" if self._end else self.sql[self._current]
|
|
1125
|
+
|
|
1126
|
+
if alnum and self._char.isalnum():
|
|
1127
|
+
# Here we use local variables instead of attributes for better performance
|
|
1128
|
+
_col = self._col
|
|
1129
|
+
_current = self._current
|
|
1130
|
+
_end = self._end
|
|
1131
|
+
_peek = self._peek
|
|
1132
|
+
|
|
1133
|
+
while _peek.isalnum():
|
|
1134
|
+
_col += 1
|
|
1135
|
+
_current += 1
|
|
1136
|
+
_end = _current >= self.size
|
|
1137
|
+
_peek = "" if _end else self.sql[_current]
|
|
1138
|
+
|
|
1139
|
+
self._col = _col
|
|
1140
|
+
self._current = _current
|
|
1141
|
+
self._end = _end
|
|
1142
|
+
self._peek = _peek
|
|
1143
|
+
self._char = self.sql[_current - 1]
|
|
1144
|
+
|
|
1145
|
+
@property
|
|
1146
|
+
def _text(self) -> str:
|
|
1147
|
+
return self.sql[self._start : self._current]
|
|
1148
|
+
|
|
1149
|
+
def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
|
|
1150
|
+
self._prev_token_line = self._line
|
|
1151
|
+
|
|
1152
|
+
if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
|
|
1153
|
+
self.tokens[-1].comments.extend(self._comments)
|
|
1154
|
+
self._comments = []
|
|
1155
|
+
|
|
1156
|
+
self.tokens.append(
|
|
1157
|
+
Token(
|
|
1158
|
+
token_type,
|
|
1159
|
+
text=self._text if text is None else text,
|
|
1160
|
+
line=self._line,
|
|
1161
|
+
col=self._col,
|
|
1162
|
+
start=self._start,
|
|
1163
|
+
end=self._current - 1,
|
|
1164
|
+
comments=self._comments,
|
|
1165
|
+
)
|
|
1166
|
+
)
|
|
1167
|
+
self._comments = []
|
|
1168
|
+
|
|
1169
|
+
# If we have either a semicolon or a begin token before the command's token, we'll parse
|
|
1170
|
+
# whatever follows the command's token as a string
|
|
1171
|
+
if (
|
|
1172
|
+
token_type in self.COMMANDS
|
|
1173
|
+
and self._peek != ";"
|
|
1174
|
+
and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
|
|
1175
|
+
):
|
|
1176
|
+
start = self._current
|
|
1177
|
+
tokens = len(self.tokens)
|
|
1178
|
+
self._scan(lambda: self._peek == ";")
|
|
1179
|
+
self.tokens = self.tokens[:tokens]
|
|
1180
|
+
text = self.sql[start : self._current].strip()
|
|
1181
|
+
if text:
|
|
1182
|
+
self._add(TokenType.STRING, text)
|
|
1183
|
+
|
|
1184
|
+
def _scan_keywords(self) -> None:
|
|
1185
|
+
size = 0
|
|
1186
|
+
word = None
|
|
1187
|
+
chars = self._text
|
|
1188
|
+
char = chars
|
|
1189
|
+
prev_space = False
|
|
1190
|
+
skip = False
|
|
1191
|
+
trie = self._KEYWORD_TRIE
|
|
1192
|
+
single_token = char in self.SINGLE_TOKENS
|
|
1193
|
+
|
|
1194
|
+
while chars:
|
|
1195
|
+
if skip:
|
|
1196
|
+
result = TrieResult.PREFIX
|
|
1197
|
+
else:
|
|
1198
|
+
result, trie = in_trie(trie, char.upper())
|
|
1199
|
+
|
|
1200
|
+
if result == TrieResult.FAILED:
|
|
1201
|
+
break
|
|
1202
|
+
if result == TrieResult.EXISTS:
|
|
1203
|
+
word = chars
|
|
1204
|
+
|
|
1205
|
+
end = self._current + size
|
|
1206
|
+
size += 1
|
|
1207
|
+
|
|
1208
|
+
if end < self.size:
|
|
1209
|
+
char = self.sql[end]
|
|
1210
|
+
single_token = single_token or char in self.SINGLE_TOKENS
|
|
1211
|
+
is_space = char.isspace()
|
|
1212
|
+
|
|
1213
|
+
if not is_space or not prev_space:
|
|
1214
|
+
if is_space:
|
|
1215
|
+
char = " "
|
|
1216
|
+
chars += char
|
|
1217
|
+
prev_space = is_space
|
|
1218
|
+
skip = False
|
|
1219
|
+
else:
|
|
1220
|
+
skip = True
|
|
1221
|
+
else:
|
|
1222
|
+
char = ""
|
|
1223
|
+
break
|
|
1224
|
+
|
|
1225
|
+
if word:
|
|
1226
|
+
if self._scan_string(word):
|
|
1227
|
+
return
|
|
1228
|
+
if self._scan_comment(word):
|
|
1229
|
+
return
|
|
1230
|
+
if prev_space or single_token or not char:
|
|
1231
|
+
self._advance(size - 1)
|
|
1232
|
+
word = word.upper()
|
|
1233
|
+
self._add(self.KEYWORDS[word], text=word)
|
|
1234
|
+
return
|
|
1235
|
+
|
|
1236
|
+
if self._char in self.SINGLE_TOKENS:
|
|
1237
|
+
self._add(self.SINGLE_TOKENS[self._char], text=self._char)
|
|
1238
|
+
return
|
|
1239
|
+
|
|
1240
|
+
self._scan_var()
|
|
1241
|
+
|
|
1242
|
+
+    def _scan_comment(self, comment_start: str) -> bool:
+        if comment_start not in self._COMMENTS:
+            return False
+
+        comment_start_line = self._line
+        comment_start_size = len(comment_start)
+        comment_end = self._COMMENTS[comment_start]
+
+        if comment_end:
+            # Skip the comment's start delimiter
+            self._advance(comment_start_size)
+
+            comment_count = 1
+            comment_end_size = len(comment_end)
+
+            while not self._end:
+                if self._chars(comment_end_size) == comment_end:
+                    comment_count -= 1
+                    if not comment_count:
+                        break
+
+                self._advance(alnum=True)
+
+                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
+                if (
+                    self.NESTED_COMMENTS
+                    and not self._end
+                    and self._chars(comment_end_size) == comment_start
+                ):
+                    self._advance(comment_start_size)
+                    comment_count += 1
+
+            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
+            self._advance(comment_end_size - 1)
+        else:
+            while not self._end and self.WHITE_SPACE.get(self._peek) is not TokenType.BREAK:
+                self._advance(alnum=True)
+            self._comments.append(self._text[comment_start_size:])
+
+        if (
+            comment_start == self.HINT_START
+            and self.tokens
+            and self.tokens[-1].token_type in self.TOKENS_PRECEDING_HINT
+        ):
+            self._add(TokenType.HINT)
+
+        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
+        # Multiple consecutive comments are preserved by appending them to the current comments list.
+        if comment_start_line == self._prev_token_line:
+            self.tokens[-1].comments.extend(self._comments)
+            self._comments = []
+            self._prev_token_line = self._line
+
+        return True
+
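The attachment rule in `_scan_comment` is observable from the public API; a quick sketch, assuming the same top-level `sqlglot` import:

```python
import sqlglot

sql = """-- leading comment
SELECT 1 /* trailing comment */"""

# A comment that opens a line is held until the next token is added; a comment
# on the same line as the previous token is appended to that token instead.
for token in sqlglot.tokenize(sql):
    if token.comments:
        print(token.text, token.comments)
# Expected (roughly): SELECT carries the leading comment, 1 the trailing one.
```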
+    def _scan_number(self) -> None:
+        if self._char == "0":
+            peek = self._peek.upper()
+            if peek == "B":
+                return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
+            elif peek == "X":
+                return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)
+
+        decimal = False
+        scientific = 0
+
+        while True:
+            if self._peek.isdigit():
+                self._advance()
+            elif self._peek == "." and not decimal:
+                if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER:
+                    return self._add(TokenType.NUMBER)
+                decimal = True
+                self._advance()
+            elif self._peek in ("-", "+") and scientific == 1:
+                scientific += 1
+                self._advance()
+            elif self._peek.upper() == "E" and not scientific:
+                scientific += 1
+                self._advance()
+            elif self._peek.isidentifier():
+                number_text = self._text
+                literal = ""
+
+                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
+                    literal += self._peek
+                    self._advance()
+
+                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal.upper(), ""))
+
+                if token_type:
+                    self._add(TokenType.NUMBER, number_text)
+                    self._add(TokenType.DCOLON, "::")
+                    return self._add(token_type, literal)
+                else:
+                    replaced = literal.replace("_", "")
+                    if self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED and replaced.isdigit():
+                        return self._add(TokenType.NUMBER, number_text + replaced)
+                    if self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
+                        return self._add(TokenType.VAR)
+
+                self._advance(-len(literal))
+                return self._add(TokenType.NUMBER, number_text)
+            else:
+                return self._add(TokenType.NUMBER)
+
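For reference, `_scan_number` consumes digits, at most one decimal point, and an optional `E`-style exponent into a single NUMBER token; a small check, assuming a standard `sqlglot` install:

```python
import sqlglot

# "1.5e-3" should come back as one NUMBER token rather than several fragments.
tokens = sqlglot.tokenize("SELECT 1.5e-3")
print([(t.token_type.name, t.text) for t in tokens])
```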
+    def _scan_bits(self) -> None:
+        self._advance()
+        value = self._extract_value()
+        try:
+            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
+            int(value, 2)
+            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
+        except ValueError:
+            self._add(TokenType.IDENTIFIER)
+
+    def _scan_hex(self) -> None:
+        self._advance()
+        value = self._extract_value()
+        try:
+            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
+            int(value, 16)
+            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
+        except ValueError:
+            self._add(TokenType.IDENTIFIER)
+
+    def _extract_value(self) -> str:
+        while True:
+            char = self._peek.strip()
+            if char and char not in self.SINGLE_TOKENS:
+                self._advance(alnum=True)
+            else:
+                break
+
+        return self._text
+
+    def _scan_string(self, start: str) -> bool:
+        base = None
+        token_type = TokenType.STRING
+
+        if start in self._QUOTES:
+            end = self._QUOTES[start]
+        elif start in self._FORMAT_STRINGS:
+            end, token_type = self._FORMAT_STRINGS[start]
+
+            if token_type == TokenType.HEX_STRING:
+                base = 16
+            elif token_type == TokenType.BIT_STRING:
+                base = 2
+            elif token_type == TokenType.HEREDOC_STRING:
+                self._advance()
+
+                if self._char == end:
+                    tag = ""
+                else:
+                    tag = self._extract_string(
+                        end,
+                        raw_string=True,
+                        raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER,
+                    )
+
+                if tag and self.HEREDOC_TAG_IS_IDENTIFIER and (self._end or not tag.isidentifier()):
+                    if not self._end:
+                        self._advance(-1)
+
+                    self._advance(-len(tag))
+                    self._add(self.HEREDOC_STRING_ALTERNATIVE)
+                    return True
+
+                end = f"{start}{tag}{end}"
+        else:
+            return False
+
+        self._advance(len(start))
+        text = self._extract_string(end, raw_string=token_type == TokenType.RAW_STRING)
+
+        if base:
+            try:
+                int(text, base)
+            except Exception:
+                raise TokenError(
+                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
+                )
+
+        self._add(token_type, text)
+        return True
+
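`_scan_string` picks the token type from the opening delimiter, and for hex/bit format strings validates the payload with `int(text, base)` before storing it unquoted. A sketch using the Postgres dialect, whose `x'…'`/`b'…'` literals are (to my reading) registered as format strings:

```python
import sqlglot

# Expect a HEX_STRING token with text "1F" and a BIT_STRING token with
# text "1010" among the results; the quote delimiters are stripped.
tokens = sqlglot.tokenize("SELECT x'1F', b'1010'", read="postgres")
print([(t.token_type.name, t.text) for t in tokens])
```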
+    def _scan_identifier(self, identifier_end: str) -> None:
+        self._advance()
+        text = self._extract_string(
+            identifier_end, escapes=self._IDENTIFIER_ESCAPES | {identifier_end}
+        )
+        self._add(TokenType.IDENTIFIER, text)
+
+    def _scan_var(self) -> None:
+        while True:
+            char = self._peek.strip()
+            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
+                self._advance(alnum=True)
+            else:
+                break
+
+        self._add(
+            TokenType.VAR
+            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
+            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
+        )
+
+    def _extract_string(
+        self,
+        delimiter: str,
+        escapes: t.Optional[t.Set[str]] = None,
+        raw_string: bool = False,
+        raise_unmatched: bool = True,
+    ) -> str:
+        text = ""
+        delim_size = len(delimiter)
+        escapes = self._STRING_ESCAPES if escapes is None else escapes
+
+        while True:
+            if (
+                not raw_string
+                and self.dialect.UNESCAPED_SEQUENCES
+                and self._peek
+                and self._char in self.STRING_ESCAPES
+            ):
+                unescaped_sequence = self.dialect.UNESCAPED_SEQUENCES.get(self._char + self._peek)
+                if unescaped_sequence:
+                    self._advance(2)
+                    text += unescaped_sequence
+                    continue
+            if (
+                (self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS or not raw_string)
+                and self._char in escapes
+                and (self._peek == delimiter or self._peek in escapes)
+                and (self._char not in self._QUOTES or self._char == self._peek)
+            ):
+                if self._peek == delimiter:
+                    text += self._peek
+                else:
+                    text += self._char + self._peek
+
+                if self._current + 1 < self.size:
+                    self._advance(2)
+                else:
+                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
+            else:
+                if self._chars(delim_size) == delimiter:
+                    if delim_size > 1:
+                        self._advance(delim_size - 1)
+                    break
+
+                if self._end:
+                    if not raise_unmatched:
+                        return text + self._char
+
+                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
+
+                current = self._current - 1
+                self._advance(alnum=True)
+                text += self.sql[current : self._current - 1]
+
+        return text
+
+    def tokenize_rs(self, sql: str) -> t.List[Token]:
+        if not self._RS_TOKENIZER:
+            raise SqlglotError("Rust tokenizer is not available")
+
+        tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
+        for token in tokens:
+            token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
+
+        # Setting this here so partial token lists can be inspected even if there is a failure
+        self.tokens = tokens
+
+        if error_msg is not None:
+            raise TokenError(error_msg)
+
+        return tokens
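Finally, `tokenize_rs` is the bridge to the optional Rust tokenizer: it reuses the Python `TokenType` enum via `_ALL_TOKEN_TYPES` and still exposes partial token lists on failure. Callers normally don't invoke it directly; `Tokenizer.tokenize` is expected to prefer it whenever the optional `sqlglotrs` extension is importable, falling back to the pure-Python scanner above otherwise:

```python
import sqlglot

# Same List[Token] shape regardless of which backend did the work; the
# Rust path is an internal fast path, not a separate API surface.
tokens = sqlglot.tokenize("SELECT 1")
print([(t.token_type.name, t.text, t.line, t.col) for t in tokens])
```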