PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/qal/parser.py
ADDED
|
@@ -0,0 +1,2344 @@
|
|
|
1
|
+
"""The parser constructs `SqlQuery` objects from query strings.
|
|
2
|
+
|
|
3
|
+
Other than the parsing itself, the process will also execute a basic column binding process. For example, consider
|
|
4
|
+
a query like *SELECT \\* FROM R WHERE R.a = 42*. In this case, the binding only affects the column reference *R.a*
|
|
5
|
+
and sets the table of that column to *R*. This binding based on column and table names is always performed.
|
|
6
|
+
|
|
7
|
+
If the table cannot be inferred based on the column name (e.g. for a query like *SELECT * FROM R, S WHERE a = 42*), a
|
|
8
|
+
second binding phase can be executed. This binding needs a working database connection and queries the database schema
|
|
9
|
+
to detect the correct tables for each column. Whether the second phase should also be executed by default can be
|
|
10
|
+
configured system-wide by setting the `auto_bind_columns` variable.
|
|
11
|
+
|
|
12
|
+
Notes
|
|
13
|
+
-----
|
|
14
|
+
Please beware that SQL parsing is a very challenging undertaking and there might be bugs in some lesser-used features.
|
|
15
|
+
If you encounter any issues, please report them on the GitHub issue tracker.
|
|
16
|
+
We test the parser based on some popular benchmarks, namely JOB and Stats to ensure that result sets from the raw SQL queries
|
|
17
|
+
match result sets from the parsed queries. However, we cannot guarantee that the parser will work for all SQL queries.
|
|
18
|
+
|
|
19
|
+
The parsing itself is based on the pglast project that implements a SQL -> JSON/dict conversion, based on the actual Postgres
|
|
20
|
+
query parser. Our parser implementation takes such a JSON representation as input and generates the more verbose structures of
|
|
21
|
+
the qal. There exists a Jupyter notebook called *PglastParsingTests* in the *tests* directory that shows the output emitted by
|
|
22
|
+
pglast for different SQL query features.
|
|
23
|
+
|
|
24
|
+
References
|
|
25
|
+
----------
|
|
26
|
+
|
|
27
|
+
.. pglast project: https://github.com/lelit/pglast Thanks a lot for maintaining this fantastic tool and the great support!
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import collections
|
|
33
|
+
import json
|
|
34
|
+
import warnings
|
|
35
|
+
from collections.abc import Iterable
|
|
36
|
+
from typing import Literal, Optional, overload
|
|
37
|
+
|
|
38
|
+
import pglast
|
|
39
|
+
|
|
40
|
+
from .. import util
|
|
41
|
+
from .._core import ColumnReference, DBCatalog, TableReference
|
|
42
|
+
from ._qal import (
|
|
43
|
+
AbstractPredicate,
|
|
44
|
+
ArrayAccessExpression,
|
|
45
|
+
BaseClause,
|
|
46
|
+
BaseProjection,
|
|
47
|
+
BetweenPredicate,
|
|
48
|
+
BinaryPredicate,
|
|
49
|
+
CaseExpression,
|
|
50
|
+
CastExpression,
|
|
51
|
+
ColumnExpression,
|
|
52
|
+
CommonTableExpression,
|
|
53
|
+
CompoundOperator,
|
|
54
|
+
CompoundPredicate,
|
|
55
|
+
DirectTableSource,
|
|
56
|
+
Explain,
|
|
57
|
+
ExplicitFromClause,
|
|
58
|
+
From,
|
|
59
|
+
FunctionExpression,
|
|
60
|
+
FunctionTableSource,
|
|
61
|
+
GroupBy,
|
|
62
|
+
Having,
|
|
63
|
+
Hint,
|
|
64
|
+
ImplicitFromClause,
|
|
65
|
+
InPredicate,
|
|
66
|
+
JoinTableSource,
|
|
67
|
+
JoinType,
|
|
68
|
+
Limit,
|
|
69
|
+
LogicalOperator,
|
|
70
|
+
MathExpression,
|
|
71
|
+
MathOperator,
|
|
72
|
+
OrderBy,
|
|
73
|
+
OrderByExpression,
|
|
74
|
+
Select,
|
|
75
|
+
SelectStatement,
|
|
76
|
+
SetOperator,
|
|
77
|
+
SetQuery,
|
|
78
|
+
SqlExpression,
|
|
79
|
+
SqlOperator,
|
|
80
|
+
SqlQuery,
|
|
81
|
+
StarExpression,
|
|
82
|
+
StaticValueExpression,
|
|
83
|
+
SubqueryExpression,
|
|
84
|
+
SubqueryTableSource,
|
|
85
|
+
TableSource,
|
|
86
|
+
UnaryPredicate,
|
|
87
|
+
ValuesList,
|
|
88
|
+
ValuesTableSource,
|
|
89
|
+
ValuesWithQuery,
|
|
90
|
+
Where,
|
|
91
|
+
WindowExpression,
|
|
92
|
+
WithQuery,
|
|
93
|
+
build_query,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# System-wide default for the second, schema-based binding phase described in the module
# docstring: when True, parsing may consult the database catalog to resolve columns whose
# owning table cannot be inferred from the query text alone.
auto_bind_columns: bool = True
"""Indicates whether the parser should use the database catalog to obtain column bindings."""
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class SchemaCache:
    """A simple cache that stores the columns that belong to tables in our database schema.

    The cache only queries the actual catalog of the database system, if the requested table has not been cached, yet.

    Parameters
    ----------
    schema : Optional[DBCatalog]
        The schema to cache. If not provided, the cache cannot resolve column bindings.
    """

    def __init__(self, schema: Optional[DBCatalog] = None) -> None:
        self._schema = schema
        # Maps table -> (columns in definition order, same columns as a set for O(1) membership).
        # A plain dict is used on purpose: entries are created exclusively by _inflate_cache. The
        # previous defaultdict(set) would have silently inserted a bare set (the wrong value type)
        # on any accidental [] access of a missing key.
        self._lookup_cache: dict[TableReference | str, tuple[list[str], set[str]]] = {}

    def initialize_with(self, schema: Optional[DBCatalog]) -> None:
        """Sets the catalog if necessary.

        Switching to a different catalog invalidates all cached lookups, since the cached columns
        describe tables of the old schema.
        """
        if self._schema is not None and self._schema != schema:
            warnings.warn("Parsing query for new schema. Dropping old schema cache.")
            self._lookup_cache.clear()
        elif self._schema is not None:
            # same schema as before, do nothing
            return
        self._schema = schema

    def lookup_column(
        self, colname: str, candidate_tables: Iterable[TableReference]
    ) -> Optional[TableReference]:
        """Resolves the table that defines a specific column.

        If no catalog is available, this method will always return *None*.

        Returns
        -------
        Optional[TableReference]
            The table that defines the column. If there are multiple tables that could define the column, an arbitrary one
            is returned. If none of the candidates is the correct table, *None* is returned.
        """
        if not self._schema:
            return None

        for candidate in candidate_tables:
            if candidate.virtual:
                # virtual tables (subqueries, CTEs) are never part of the physical catalog
                continue

            _, table_columns = self._inflate_cache(candidate)
            if colname in table_columns:
                return candidate

        return None

    def columns_of(self, table: TableReference | str) -> list[str]:
        """Provides the columns that belong to a specific table.

        If no catalog is available, this method will always return an empty list.
        """
        if not self._schema:
            return []

        cols, _ = self._inflate_cache(table)
        return cols

    def _inflate_cache(self, table: TableReference | str) -> tuple[list[str], set[str]]:
        """Provides the columns that belong to a specific table, consulting the online catalog if necessary.

        This method assumes that there is indeed an online schema available. Calling this method without a schema will
        result in an arbitrary runtime error.

        Returns
        -------
        tuple[list[str], set[str]]
            The columns of the table in their defined order, as well as the same columns as a set.
        """
        cached_res = self._lookup_cache.get(table)
        if cached_res:
            return cached_res
        cols: list[str] = [col.name for col in self._schema.columns(table)]
        cols_set = set(cols)
        self._lookup_cache[table] = cols, cols_set
        return cols, cols_set
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class QueryNamespace:
|
|
186
|
+
"""The query namespace acts as the central service to resolve column bindings in a query.
|
|
187
|
+
|
|
188
|
+
It maintains a visibility map of all tables at a given point in the query and keeps track of the columns that form the
|
|
189
|
+
result relation at the same points in time. This information is used to bind column references to the correct tables,
|
|
190
|
+
including temporary virtual tables that alias existing physical columns.
|
|
191
|
+
|
|
192
|
+
The namespace protocol works as follows:
|
|
193
|
+
|
|
194
|
+
- While parsing a query, the table sources (CTEs and FROM entries) should be handled first. Each source should be
|
|
195
|
+
registered in the namespace using the `register_table` method.
|
|
196
|
+
- When a subquery or CTE is encoutered, the `open_nested` method has to be called to open a new local namespace and
|
|
197
|
+
track the virtual table correctly.
|
|
198
|
+
- Once all tables are registered, the parser can handle the *SELECT* clause. Afterwards, `determine_output_shape` has to
|
|
199
|
+
called to compute all columns that are part of the result relation of the current namespace. This method takes care
|
|
200
|
+
of resolving *SELECT \\** operations as necessary and requires that all input sources have already been registered and
|
|
201
|
+
completely parsed, such that their output shapes are known.
|
|
202
|
+
- While parsing the different clauses of the query, `lookup_column` and `resolve_table` can be used to determine the
|
|
203
|
+
correct table references based on the sources that are currently available in the namespace.
|
|
204
|
+
|
|
205
|
+
Each namespace can be connected to a parent namespace, which in turn can provide additional CTEs, physical tables or
|
|
206
|
+
subqueries (if the current namespace is for a LATERAL subquery). This allows the current namespace to check whether some
|
|
207
|
+
column is actually provided by an outer scope if the namespace does not provide the column itself.
|
|
208
|
+
"""
|
|
209
|
+
|
|
210
|
+
_schema_cache: SchemaCache = SchemaCache()
|
|
211
|
+
"""The schema cache that is used to resolve column bindings. This cache is shared through the entire program lifetime.
|
|
212
|
+
|
|
213
|
+
Changing the actual database schema while PostBOUND is running will result in undefined behavior.
|
|
214
|
+
"""
|
|
215
|
+
|
|
216
|
+
@staticmethod
|
|
217
|
+
def empty(schema: Optional["DatabaseSchema"] = None) -> QueryNamespace: # type: ignore # noqa: F821
|
|
218
|
+
QueryNamespace._schema_cache.initialize_with(schema)
|
|
219
|
+
return QueryNamespace()
|
|
220
|
+
|
|
221
|
+
def __init__(self, *, parent: Optional[QueryNamespace] = None) -> None:
|
|
222
|
+
self._parent = parent
|
|
223
|
+
|
|
224
|
+
self._subquery_children: dict[str, QueryNamespace] = {}
|
|
225
|
+
"""Nested namespaces that are provided as part of subqueries. Entries map alias -> query."""
|
|
226
|
+
|
|
227
|
+
self._setop_children: list[QueryNamespace, QueryNamespace] = []
|
|
228
|
+
"""Namespace of the queries that form a set operation in the current namespace."""
|
|
229
|
+
|
|
230
|
+
self._current_ctx: list[TableReference] = []
|
|
231
|
+
"""The tables that are currently in scope, no matter their origin (CTEs or FROM clause).
|
|
232
|
+
|
|
233
|
+
For the purpose of this dictionary, it does not matter where a table comes from (physical table, CTE, subquery, ...).
|
|
234
|
+
The only thing that matters is that the table is part of the FROM clause. This is especially important to build the
|
|
235
|
+
correct output shape of the namespace/relation if the SELECT clause contains * expressions.
|
|
236
|
+
|
|
237
|
+
Therefore, there might be tables that are contained in the `_cte_sources`, but not here (if the CTE is only used to
|
|
238
|
+
build other CTEs, but not part of the FROM clause itself).
|
|
239
|
+
|
|
240
|
+
Notice that tables that are defined in an enclosing scope (e.g. outer query in a sequence of nested CTEs) are not
|
|
241
|
+
contained in this context if they are not also part of this namespace's *FROM* clause. Instead, they are resolved
|
|
242
|
+
through the API on the parent namespace.
|
|
243
|
+
|
|
244
|
+
The ordering is important to resolve output columns correctly to the first match as Postgres does.
|
|
245
|
+
|
|
246
|
+
An optimized container to check whether a table is part of the current context is available via `_table_sources`
|
|
247
|
+
"""
|
|
248
|
+
|
|
249
|
+
self._cte_sources: dict[str, QueryNamespace] = {}
|
|
250
|
+
"""Namespaces that are induced by CTEs. Entries map alias -> CTE."""
|
|
251
|
+
|
|
252
|
+
self._table_sources: dict[str, TableReference] = {}
|
|
253
|
+
"""The tables that are part of the FROM clause of the query. Entries map alias -> table.
|
|
254
|
+
|
|
255
|
+
Tables can be contained in this dictionary multiple times: once for each relevant identifier. If a table has both an
|
|
256
|
+
alias as well as a full name, both keys will be present.
|
|
257
|
+
"""
|
|
258
|
+
|
|
259
|
+
self._output_shape: list[str] = []
|
|
260
|
+
"""The column names that are part of the result set produced by the queries in this namespace.
|
|
261
|
+
|
|
262
|
+
These are really just the column names, not full references. This is because it makes the access mechanism more
|
|
263
|
+
transparent (just use the name, duh) and prevents accidental issues when a column from an inner query is re-used in an
|
|
264
|
+
outer query and is bound to both virtual tables. Comparing the references would indicate that these are different
|
|
265
|
+
columns (which arguably they are, just not for our purposes).
|
|
266
|
+
"""
|
|
267
|
+
|
|
268
|
+
self._column_cache: dict[str, TableReference] = {}
|
|
269
|
+
"""A cache to resolve common columns in the current context more quickly."""
|
|
270
|
+
|
|
271
|
+
def determine_output_shape(
|
|
272
|
+
self, select_clause: Optional[Select | Iterable[ColumnReference | str]]
|
|
273
|
+
) -> None:
|
|
274
|
+
"""Determines the columns that form the result relation of this namespace.
|
|
275
|
+
|
|
276
|
+
The result is only stored internally to allow parent namespaces to resolve column references correctly.
|
|
277
|
+
|
|
278
|
+
This method should only be called after all table sources from the current namespace are already registered in order to
|
|
279
|
+
ensure that star expressions can be resolved correctly.
|
|
280
|
+
"""
|
|
281
|
+
self._output_shape = []
|
|
282
|
+
if self._setop_children:
|
|
283
|
+
# We use Postgre's rules here: the output relation of a set operation contains exactly those columns that are
|
|
284
|
+
# contained in the LHS relation
|
|
285
|
+
self._output_shape = list(self._setop_children[0]._output_shape)
|
|
286
|
+
return
|
|
287
|
+
|
|
288
|
+
for projection in select_clause:
|
|
289
|
+
if isinstance(projection, (str, ColumnReference)):
|
|
290
|
+
self._output_shape.append(
|
|
291
|
+
projection.name
|
|
292
|
+
if isinstance(projection, ColumnReference)
|
|
293
|
+
else projection
|
|
294
|
+
)
|
|
295
|
+
continue
|
|
296
|
+
|
|
297
|
+
# must be BaseProjection
|
|
298
|
+
if projection.target_name:
|
|
299
|
+
self._output_shape.append(projection.target_name)
|
|
300
|
+
continue
|
|
301
|
+
|
|
302
|
+
match projection.expression:
|
|
303
|
+
case ColumnExpression(col):
|
|
304
|
+
self._output_shape.append(col.name)
|
|
305
|
+
|
|
306
|
+
case StarExpression(from_table):
|
|
307
|
+
ctx = {from_table} if from_table else self._current_ctx
|
|
308
|
+
for table in ctx:
|
|
309
|
+
if not table.virtual:
|
|
310
|
+
self._output_shape.extend(
|
|
311
|
+
self._schema_cache.columns_of(table)
|
|
312
|
+
)
|
|
313
|
+
continue
|
|
314
|
+
|
|
315
|
+
if table.alias:
|
|
316
|
+
defining_nsp = self._lookup_namespace(table.alias)
|
|
317
|
+
if not defining_nsp and table.full_name:
|
|
318
|
+
# if we try to look up an aliased CTE, we need to use the full name instead
|
|
319
|
+
defining_nsp = self._lookup_namespace(table.full_name)
|
|
320
|
+
if not defining_nsp:
|
|
321
|
+
raise ParserError(f"No namespace found for table '{table}'")
|
|
322
|
+
self._output_shape.extend(defining_nsp._output_shape)
|
|
323
|
+
|
|
324
|
+
case _:
|
|
325
|
+
# do nothing, this is an expression that cannot be referenced later on!
|
|
326
|
+
pass
|
|
327
|
+
|
|
328
|
+
def register_table(self, table: TableReference) -> None:
|
|
329
|
+
"""Adds a "physical" table to the current namespace.
|
|
330
|
+
|
|
331
|
+
In truth, the table does not need to be physical, it can also be a CTE that was defined in an outer namespace and
|
|
332
|
+
is scanned here. "Physical" in this context means that the current namespace does not define the table itself.
|
|
333
|
+
"""
|
|
334
|
+
self._invalidate_column_cache()
|
|
335
|
+
self._current_ctx.append(table)
|
|
336
|
+
if table.alias:
|
|
337
|
+
self._table_sources[table.alias] = table
|
|
338
|
+
if table.full_name:
|
|
339
|
+
self._table_sources[table.full_name] = table
|
|
340
|
+
|
|
341
|
+
def provides_column(self, name: str) -> bool:
|
|
342
|
+
"""Checks, whether the current namespace has a specific column in its output relation."""
|
|
343
|
+
return name in self._output_shape
|
|
344
|
+
|
|
345
|
+
def lookup_column(self, key: str) -> Optional[TableReference]:
|
|
346
|
+
"""Searches for the table that provies a specific column.
|
|
347
|
+
|
|
348
|
+
This table can be either virtual, i.e. a subquery or CTE (possibly from an outer namespace), or an actual physical
|
|
349
|
+
table from the current database.
|
|
350
|
+
|
|
351
|
+
If no table is found , *None* is returned.
|
|
352
|
+
"""
|
|
353
|
+
cached_table = self._column_cache.get(key)
|
|
354
|
+
if cached_table:
|
|
355
|
+
return cached_table
|
|
356
|
+
|
|
357
|
+
matching_table: Optional[TableReference] = None
|
|
358
|
+
for table in self._current_ctx:
|
|
359
|
+
# later tables overwrite unqualified columns of earlier tables
|
|
360
|
+
physical_table = (
|
|
361
|
+
self._schema_cache.lookup_column(key, [table])
|
|
362
|
+
if not table.virtual
|
|
363
|
+
else None
|
|
364
|
+
)
|
|
365
|
+
if physical_table:
|
|
366
|
+
matching_table = table
|
|
367
|
+
break
|
|
368
|
+
|
|
369
|
+
subquery_nsp = self._subquery_children.get(table.identifier())
|
|
370
|
+
if subquery_nsp and subquery_nsp.provides_column(key):
|
|
371
|
+
matching_table = table
|
|
372
|
+
break
|
|
373
|
+
|
|
374
|
+
cte_nsp = self._cte_sources.get(table.identifier())
|
|
375
|
+
if cte_nsp and cte_nsp.provides_column(key):
|
|
376
|
+
matching_table = table
|
|
377
|
+
break
|
|
378
|
+
|
|
379
|
+
parent_nsp = self._lookup_namespace(table.identifier())
|
|
380
|
+
if parent_nsp and parent_nsp.provides_column(key):
|
|
381
|
+
matching_table = table
|
|
382
|
+
break
|
|
383
|
+
|
|
384
|
+
if not matching_table:
|
|
385
|
+
return None
|
|
386
|
+
|
|
387
|
+
self._column_cache[key] = matching_table
|
|
388
|
+
return matching_table
|
|
389
|
+
|
|
390
|
+
def resolve_table(self, key: str) -> Optional[TableReference]:
|
|
391
|
+
"""Searches for the table that is referenced by a specific key.
|
|
392
|
+
|
|
393
|
+
The table can be either provided by this namespace (as a physical table in the *FROM* clause, or defined through a
|
|
394
|
+
subquery/CTE), or by an outer namspace.
|
|
395
|
+
"""
|
|
396
|
+
sourced_table = self._table_sources.get(key)
|
|
397
|
+
if sourced_table:
|
|
398
|
+
return sourced_table
|
|
399
|
+
|
|
400
|
+
if key in self._cte_sources:
|
|
401
|
+
return TableReference.create_virtual(key)
|
|
402
|
+
|
|
403
|
+
return self._parent.resolve_table(key) if self._parent else None
|
|
404
|
+
|
|
405
|
+
def open_nested(
|
|
406
|
+
self,
|
|
407
|
+
*,
|
|
408
|
+
alias: str = "",
|
|
409
|
+
source: Literal["cte", "subquery", "setop", "values", "temporary"],
|
|
410
|
+
) -> QueryNamespace:
|
|
411
|
+
"""Creates a new local namespace for a nested query.
|
|
412
|
+
|
|
413
|
+
Depending on the type of nested query, the namespace will be registered in different ways and used for different
|
|
414
|
+
purposes (see parameters below).
|
|
415
|
+
|
|
416
|
+
Parameters
|
|
417
|
+
----------
|
|
418
|
+
alias : str, optional
|
|
419
|
+
The name of the namespace. This is only relevant for CTEs and subqueries in the FROM clause.
|
|
420
|
+
source : Literal["cte", "subquery", "setop", "values", "temporary"]
|
|
421
|
+
The type of nested query. This value is used to determine the use of the subquery namespace as follows:
|
|
422
|
+
- "cte": The namespace is a CTE that is part of the query.
|
|
423
|
+
- "subquery": The namespace is a subquery in the FROM clause.
|
|
424
|
+
- "setop": The namespace is part of a set operation. No alias is required, but the namespace might be used to
|
|
425
|
+
determine the output shape of the current namespace
|
|
426
|
+
- "values": The namespace is a temporary table that is part of a VALUES clause.
|
|
427
|
+
- "temporary": The namespace is a temporary table that is part of a subquery which is not used in the FROM clause,
|
|
428
|
+
e.g. as a filter condition.
|
|
429
|
+
"""
|
|
430
|
+
if source != "temporary":
|
|
431
|
+
self._invalidate_column_cache()
|
|
432
|
+
|
|
433
|
+
child = QueryNamespace(parent=self)
|
|
434
|
+
|
|
435
|
+
match source:
|
|
436
|
+
case "cte":
|
|
437
|
+
self._cte_sources[alias] = child
|
|
438
|
+
case "subquery" | "values":
|
|
439
|
+
table = TableReference.create_virtual(alias)
|
|
440
|
+
self._subquery_children[alias] = child
|
|
441
|
+
self._current_ctx.append(table)
|
|
442
|
+
self._table_sources[alias] = table
|
|
443
|
+
case "setop":
|
|
444
|
+
self._setop_children.append(child)
|
|
445
|
+
case _:
|
|
446
|
+
# ignore other sources
|
|
447
|
+
pass
|
|
448
|
+
|
|
449
|
+
return child
|
|
450
|
+
|
|
451
|
+
def _lookup_namespace(self, table_key: str) -> Optional[QueryNamespace]:
|
|
452
|
+
"""Searches for the (parent) namespace that provides a specific table."""
|
|
453
|
+
cte_nsp = self._cte_sources.get(table_key)
|
|
454
|
+
if cte_nsp:
|
|
455
|
+
return cte_nsp
|
|
456
|
+
|
|
457
|
+
subquery_nsp = self._subquery_children.get(table_key)
|
|
458
|
+
if subquery_nsp:
|
|
459
|
+
return subquery_nsp
|
|
460
|
+
|
|
461
|
+
return self._parent._lookup_namespace(table_key) if self._parent else None
|
|
462
|
+
|
|
463
|
+
def _invalidate_column_cache(self) -> None:
|
|
464
|
+
"""Clears all currently cached columns in case there is fear of a change in the column bindings."""
|
|
465
|
+
self._column_cache.clear()
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def _pglast_is_actual_colref(pglast_data: dict) -> bool:
|
|
469
|
+
"""Checks, whether a apparent column reference is actually a column reference and not a star expression in disguise.
|
|
470
|
+
|
|
471
|
+
pglast represents both column references such as *R.a* or *a* as well as star expressions like *R.\\** as ``ColumnRef``
|
|
472
|
+
dictionaries, hence we need to make sure we are actually parsing the right thing. This method takes care of distinguishing
|
|
473
|
+
the two cases.
|
|
474
|
+
|
|
475
|
+
Parameters
|
|
476
|
+
----------
|
|
477
|
+
pglast_data : dict
|
|
478
|
+
JSON encoding of the potential column
|
|
479
|
+
|
|
480
|
+
Returns
|
|
481
|
+
-------
|
|
482
|
+
bool
|
|
483
|
+
*True* if this is an actual column reference, *False* if this is a star expression.
|
|
484
|
+
"""
|
|
485
|
+
fields: list[dict] = pglast_data["fields"]
|
|
486
|
+
if len(fields) == 1:
|
|
487
|
+
return "A_Star" not in fields[0]
|
|
488
|
+
if len(fields) == 2:
|
|
489
|
+
would_be_col: str = fields[1]
|
|
490
|
+
return "A_Star" not in would_be_col
|
|
491
|
+
|
|
492
|
+
would_be_col: str = fields[0]["String"]["sval"]
|
|
493
|
+
return not would_be_col.endswith("*")
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _pglast_create_bound_colref(
    tab: str, col: str, *, namespace: QueryNamespace
) -> ColumnReference:
    """Creates a new reference to a column with known binding info.

    Parameters
    ----------
    tab : str
        The table to which to bind
    col : str
        The column to bind
    namespace : QueryNamespace
        The tables and columns that are available in the current query.

    Returns
    -------
    ColumnReference
        The new column reference
    """
    resolved_table = namespace.resolve_table(tab)
    if not resolved_table:
        raise ParserError("Table not found: " + tab)
    return ColumnReference(col, resolved_table)
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def _pglast_parse_colref(
    pglast_data: dict, *, namespace: QueryNamespace
) -> ColumnReference:
    """Handler method to parse column references in the query.

    The column will be bound to its table if possible. This binding process uses the following rules:

    - if the columns has already been resolved as part of an earlier parsing step in the same namespace, this column is re-used
    - if the column is specified in qualified syntax (i.e. *table.column*), the table is directly inferred
    - if the column is not qualified, but a `schema` is given, this schema is used together with the candidates from the
      current namespace to lookup the owning table
    - otherwise, the column is left unbound :(

    Parameters
    ----------
    pglast_data : dict
        JSON enconding of the column
    namespace : QueryNamespace
        The tables and columns that are available in the current query.

    Returns
    -------
    ColumnReference
        The parsed column reference.
    """
    fields: list[dict] = pglast_data["fields"]
    n_fields = len(fields)
    if n_fields > 2:
        raise ParserError("Unknown column reference format: " + str(pglast_data))

    if n_fields == 2:
        tab_field, col_field = fields
        return _pglast_create_bound_colref(
            tab_field["String"]["sval"],
            col_field["String"]["sval"],
            namespace=namespace,
        )

    # Single identifier: try to bind through the namespace; if that fails, the column stays unbound.
    colname: str = fields[0]["String"]["sval"]
    return ColumnReference(colname, namespace.lookup_column(colname))
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _pglast_parse_star(
    pglast_data: dict, *, namespace: QueryNamespace
) -> StarExpression:
    """Handler method to parse star expressions that are potentially bounded to a specific table, e.g. *R.\\**.

    Parameters
    ----------
    pglast_data : dict
        JSON enconding of the star expression
    namespace : QueryNamespace
        The tables and columns that are available in the current query.

    Returns
    -------
    StarExpression
        The parsed star expression.
    """
    fields = pglast_data["fields"]
    n_fields = len(fields)

    if n_fields == 1 and "A_Star" in fields[0]:
        # plain *SELECT \\** without a table qualifier
        return StarExpression()

    if n_fields == 2:
        # qualified star such as *R.\\** -- bind it to the referenced table
        table_key = fields[0]["String"]["sval"]
        return StarExpression(from_table=namespace.resolve_table(table_key))

    raise ParserError("Unknown star reference format: " + str(pglast_data))
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def _pglast_parse_const(pglast_data: dict) -> StaticValueExpression:
|
|
592
|
+
"""Handler method to parse constant values in the query.
|
|
593
|
+
|
|
594
|
+
Parameters
|
|
595
|
+
----------
|
|
596
|
+
pglast_data : dict
|
|
597
|
+
JSON enconding of the value. This data is extracted from the pglast data structure.
|
|
598
|
+
|
|
599
|
+
Returns
|
|
600
|
+
-------
|
|
601
|
+
StaticValueExpression
|
|
602
|
+
The parsed constant value.
|
|
603
|
+
"""
|
|
604
|
+
pglast_data.pop("location", None)
|
|
605
|
+
valtype = util.dicts.key(pglast_data)
|
|
606
|
+
match valtype:
|
|
607
|
+
case "isnull":
|
|
608
|
+
return StaticValueExpression.null()
|
|
609
|
+
case "ival":
|
|
610
|
+
val = pglast_data["ival"]["ival"] if "ival" in pglast_data["ival"] else 0
|
|
611
|
+
return StaticValueExpression(val)
|
|
612
|
+
case "fval":
|
|
613
|
+
val = pglast_data["fval"]["fval"]
|
|
614
|
+
return StaticValueExpression(float(val))
|
|
615
|
+
case "sval":
|
|
616
|
+
return StaticValueExpression(pglast_data["sval"]["sval"])
|
|
617
|
+
case "boolval":
|
|
618
|
+
val = pglast_data["boolval"].get("boolval", False)
|
|
619
|
+
return StaticValueExpression(val)
|
|
620
|
+
case _:
|
|
621
|
+
raise ParserError("Unknown constant type: " + str(pglast_data))
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
_PglastOperatorMap: dict[str, SqlOperator] = {
    # comparison operators (both spellings of "not equal" map to the same QAL operator)
    "=": LogicalOperator.Equal,
    "<": LogicalOperator.Less,
    "<=": LogicalOperator.LessEqual,
    ">": LogicalOperator.Greater,
    ">=": LogicalOperator.GreaterEqual,
    "<>": LogicalOperator.NotEqual,
    "!=": LogicalOperator.NotEqual,
    # boolean connectives (pglast encodes these as enum-style names rather than symbols)
    "AND_EXPR": CompoundOperator.And,
    "OR_EXPR": CompoundOperator.Or,
    "NOT_EXPR": CompoundOperator.Not,
    # arithmetic and string operators
    "+": MathOperator.Add,
    "-": MathOperator.Subtract,
    "*": MathOperator.Multiply,
    "/": MathOperator.Divide,
    "%": MathOperator.Modulo,
    "||": MathOperator.Concatenate,
}
"""Map from the internal representation of Postgres operators to our standardized QAL operators."""
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def _pglast_parse_operator(pglast_data: list[dict]) -> SqlOperator:
    """Handler method to parse operators into our query representation.

    Parameters
    ----------
    pglast_data : list[dict]
        JSON encoding of the operator. This data is extracted from the pglast data structure. It is expected to be a
        single-element list containing a *String* node with the operator's textual representation.

    Returns
    -------
    SqlOperator
        The parsed operator.
    """
    # pglast wraps the operator name in a one-element list; anything else is malformed
    if len(pglast_data) != 1:
        raise ParserError("Unknown operator format: " + str(pglast_data))

    raw_operator = pglast_data[0]
    string_node = raw_operator.get("String", {})
    if "sval" not in string_node:
        raise ParserError("Unknown operator format: " + str(pglast_data))

    sval = string_node["sval"]
    mapped_operator = _PglastOperatorMap.get(sval)
    if mapped_operator is None:
        # the operator exists in Postgres, but we do not support it (yet)
        raise ParserError("Operator not yet in target map: " + sval)
    return mapped_operator
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
_PglastTypeMap: dict[str, str] = {
|
|
670
|
+
"bpchar": "char",
|
|
671
|
+
"serial8": "bigserial",
|
|
672
|
+
"int4": "integer",
|
|
673
|
+
"int2": "smallint",
|
|
674
|
+
"int8": "bigint",
|
|
675
|
+
"float4": "real",
|
|
676
|
+
"float8": "double precision",
|
|
677
|
+
"boolean": "bool",
|
|
678
|
+
}
|
|
679
|
+
"""Map from the internal representation of Postgres types to the SQL standard types."""
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def _pglast_parse_type(pglast_data: dict) -> str:
    """Handler method to parse type information from explicit type casts.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the type information.

    Returns
    -------
    str
        The actual type
    """
    # a well-formed type reference carries at most a schema qualifier plus the type name
    if "names" not in pglast_data or len(pglast_data["names"]) > 2:
        raise ParserError("Unknown type format: " + str(pglast_data))

    # the final entry is always the type itself (a leading entry would be the schema)
    raw_type = pglast_data["names"][-1]["String"]["sval"]

    # user-defined types are not in the map and fall back to their raw spelling
    return _PglastTypeMap.get(raw_type, raw_type)
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def _pglast_parse_case(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> CaseExpression:
    """Handler method to parse *CASE* expressions in a query.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the *CASE* expression data. This data is extracted from the pglast data structure.
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
        consider, such as hint blocks.

    Returns
    -------
    CaseExpression
        The parsed *CASE* expression.
    """
    branches: list[tuple[AbstractPredicate, SqlExpression]] = []
    for raw_branch in pglast_data["args"]:
        when_node = raw_branch["CaseWhen"]
        # each WHEN consists of a condition (a predicate) and its result expression
        condition = _pglast_parse_predicate(
            when_node["expr"], namespace=namespace, query_txt=query_txt
        )
        outcome = _pglast_parse_expression(
            when_node["result"], namespace=namespace, query_txt=query_txt
        )
        branches.append((condition, outcome))

    # the ELSE branch is optional; absence maps to None
    fallback = (
        _pglast_parse_expression(
            pglast_data["defresult"], namespace=namespace, query_txt=query_txt
        )
        if "defresult" in pglast_data
        else None
    )

    return CaseExpression(branches, else_expr=fallback)
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def _pglast_parse_expression(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> SqlExpression:
    """Handler method to parse arbitrary expressions in the query.

    For some more complex expressions, this method will delegate to tailored parsing methods.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the expression data. This data is extracted from the pglast data structure.
    namespace: QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
        consider, such as hint blocks.

    Returns
    -------
    SqlExpression
        The parsed expression.

    Raises
    ------
    ParserError
        If the expression node is of a type this parser does not recognize.
    """
    # location is a byte offset into the query text -- irrelevant for us and it would
    # break the "single remaining key" dispatch below
    pglast_data.pop("location", None)
    expression_key = util.dicts.key(pglast_data)

    # When parsing the actual expression, we need to be aware that many expressions can actually be predicates, just not
    # within the WHERE or HAVING clause. For example, "SELECT a IS NOT NULL FROM foo" is a perfectly valid query.
    # Therefore, we handle a lot of expression cases by passing the input data back to our predicate parser and let it do the
    # heavy lifting.

    match expression_key:
        # a plain (possibly qualified) column reference
        case "ColumnRef" if _pglast_is_actual_colref(pglast_data["ColumnRef"]):
            column = _pglast_parse_colref(pglast_data["ColumnRef"], namespace=namespace)
            return ColumnExpression(column)

        # a "*" or "tab.*" target, which pglast also encodes as a ColumnRef
        case "ColumnRef" if not _pglast_is_actual_colref(pglast_data["ColumnRef"]):
            return _pglast_parse_star(pglast_data["ColumnRef"], namespace=namespace)

        # literal constants (numbers, strings, booleans, NULL)
        case "A_Const":
            return _pglast_parse_const(pglast_data["A_Const"])

        # generic operator application: unary/binary math or a binary comparison
        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_OP":
            expression = pglast_data["A_Expr"]
            operation = _pglast_parse_operator(expression["name"])
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )

            # no left operand --> unary operator (e.g. "-x"); only math operators may be unary here
            if "lexpr" not in expression and operation in MathOperator:
                return MathExpression(operation, right)
            elif "lexpr" not in expression:
                raise ParserError("Unknown operator format: " + str(expression))

            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )

            # comparisons become predicates, everything else stays a math expression
            if operation in LogicalOperator:
                return BinaryPredicate(operation, left, right)

            return MathExpression(operation, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] in {
            "AEXPR_LIKE",
            "AEXPR_ILIKE",
            "AEXPR_BETWEEN",
            "AEXPR_IN",
        }:
            # we need to parse a predicate in disguise
            predicate = _pglast_parse_predicate(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return predicate

        # IS [NOT] NULL tests are predicates as well
        case "NullTest":
            predicate = _pglast_parse_predicate(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return predicate

        # AND/OR/NOT combinations -- also delegated to the predicate parser
        case "BoolExpr":
            predicate = _pglast_parse_predicate(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return predicate

        case "FuncCall" if (
            "over" not in pglast_data["FuncCall"]
        ):  # normal functions, aggregates and UDFs
            expression: dict = pglast_data["FuncCall"]
            # funcname is a list of String nodes, e.g. [schema, function]
            funcname = ".".join(
                elem["String"]["sval"] for elem in expression["funcname"]
            )
            distinct = expression.get("agg_distinct", False)
            # FILTER (WHERE ...) attached to an aggregate call
            if expression.get("agg_filter", False):
                filter_expr = _pglast_parse_predicate(
                    expression["agg_filter"], namespace=namespace, query_txt=query_txt
                )
            else:
                filter_expr = None

            # agg_star marks calls like COUNT(*)
            if expression.get("agg_star", False):
                return FunctionExpression(
                    funcname,
                    [StarExpression()],
                    distinct=distinct,
                    filter_where=filter_expr,
                )

            args = [
                _pglast_parse_expression(arg, namespace=namespace, query_txt=query_txt)
                for arg in expression.get("args", [])
            ]
            return FunctionExpression(
                funcname, args, distinct=distinct, filter_where=filter_expr
            )

        case "FuncCall" if "over" in pglast_data["FuncCall"]:  # window functions
            expression: dict = pglast_data["FuncCall"]
            funcname = ".".join(
                elem["String"]["sval"] for elem in expression["funcname"]
            )

            args = [
                _pglast_parse_expression(arg, namespace=namespace, query_txt=query_txt)
                for arg in expression.get("args", [])
            ]
            fn = FunctionExpression(funcname, args)

            window_spec: dict = expression["over"]

            # optional PARTITION BY expressions
            if "partitionClause" in window_spec:
                partition = [
                    _pglast_parse_expression(
                        partition, namespace=namespace, query_txt=query_txt
                    )
                    for partition in window_spec["partitionClause"]
                ]
            else:
                partition = None

            # optional ORDER BY within the window frame
            if "orderClause" in window_spec:
                order = _pglast_parse_orderby(
                    window_spec["orderClause"], namespace=namespace, query_txt=query_txt
                )
            else:
                order = None

            # NOTE(review): here agg_filter is parsed with the expression parser, whereas the
            # non-window FuncCall branch above uses the predicate parser -- confirm this
            # asymmetry is intentional.
            if "agg_filter" in expression:
                filter_expr = _pglast_parse_expression(
                    expression["agg_filter"], namespace=namespace, query_txt=query_txt
                )
            else:
                filter_expr = None

            return WindowExpression(
                fn, partitioning=partition, ordering=order, filter_condition=filter_expr
            )

        # COALESCE has a dedicated node type but we model it as a plain function call
        case "CoalesceExpr":
            expression = pglast_data["CoalesceExpr"]
            args = [
                _pglast_parse_expression(arg, namespace=namespace, query_txt=query_txt)
                for arg in expression["args"]
            ]
            return FunctionExpression("coalesce", args)

        # explicit type casts, e.g. "expr::varchar(255)"
        case "TypeCast":
            expression: dict = pglast_data["TypeCast"]
            casted_expression = _pglast_parse_expression(
                expression["arg"], namespace=namespace, query_txt=query_txt
            )
            target_type = _pglast_parse_type(expression["typeName"])
            # typmods carry type parameters such as the length of varchar(N)
            type_params = [
                _pglast_parse_expression(
                    param, namespace=namespace, query_txt=query_txt
                )
                for param in expression["typeName"].get("typmods", [])
            ]

            return CastExpression(
                casted_expression, target_type, type_params=type_params
            )

        case "CaseExpr":
            return _pglast_parse_case(
                pglast_data["CaseExpr"], namespace=namespace, query_txt=query_txt
            )

        # scalar subqueries used as expressions, e.g. "SELECT (SELECT max(x) FROM t)"
        case "SubLink" if pglast_data["SubLink"]["subLinkType"] == "EXPR_SUBLINK":
            subquery = _pglast_parse_query(
                pglast_data["SubLink"]["subselect"]["SelectStmt"],
                query_txt=query_txt,
                namespace=namespace.open_nested(source="temporary"),
            )
            return SubqueryExpression(subquery)

        # array subscripting / slicing, e.g. "arr[1]" or "arr[2:5]"
        case "A_Indirection":
            expression: dict = pglast_data["A_Indirection"]
            array_expression = _pglast_parse_expression(
                expression["arg"], namespace=namespace, query_txt=query_txt
            )

            # fold successive index accesses left-to-right, wrapping the previous result each time
            for index_expression in expression["indirection"]:
                index_expression: dict = index_expression["A_Indices"]

                if index_expression.get("is_slice", False):
                    # slices may omit either bound ("arr[:5]", "arr[2:]")
                    lower = (
                        _pglast_parse_expression(
                            index_expression["lidx"],
                            namespace=namespace,
                            query_txt=query_txt,
                        )
                        if "lidx" in index_expression
                        else None
                    )
                    upper = (
                        _pglast_parse_expression(
                            index_expression["uidx"],
                            namespace=namespace,
                            query_txt=query_txt,
                        )
                        if "uidx" in index_expression
                        else None
                    )
                    array_expression = ArrayAccessExpression(
                        array_expression, lower_idx=lower, upper_idx=upper
                    )
                    continue

                # point access: pglast stores the single index in "uidx"
                point_index = _pglast_parse_expression(
                    index_expression["uidx"], namespace=namespace, query_txt=query_txt
                )
                array_expression = ArrayAccessExpression(
                    array_expression, idx=point_index
                )

            return array_expression

        case _:
            raise ParserError("Unknown expression type: " + str(pglast_data))
|
|
987
|
+
|
|
988
|
+
|
|
989
|
+
def _pglast_parse_values_cte(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str = ""
) -> tuple[ValuesList, list[str]]:
    """Handler method to parse a CTE with a *VALUES* expressions.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the CTE data. This data is extracted from the pglast data structure.
    namespace: QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str, optional
        The raw query text that was passed to the parser. It is forwarded to the expression parser, which requires it
        to extract information that the PG parser does not consider (such as hint blocks). Defaults to an empty string
        so that existing callers remain compatible.

    Returns
    -------
    tuple[ValuesList, list[str]]
        The parsed *VALUES* expression and the column names.
    """
    values: ValuesList = []
    for row in pglast_data["ctequery"]["SelectStmt"]["valuesLists"]:
        raw_items = row["List"]["items"]
        # BUGFIX: _pglast_parse_expression requires the keyword-only query_txt argument; the previous
        # call omitted it, which raised a TypeError for every VALUES-based CTE.
        parsed_items = [
            _pglast_parse_expression(item, namespace=namespace, query_txt=query_txt)
            for item in raw_items
        ]
        values.append(tuple(parsed_items))

    # explicit column names from the CTE alias, e.g. "WITH v (a, b) AS (VALUES ...)"
    colnames: list[str] = [
        raw_colname["String"]["sval"]
        for raw_colname in pglast_data.get("aliascolnames", [])
    ]

    # only a non-empty column list defines the CTE's output shape
    if colnames:
        namespace.determine_output_shape(colnames)

    return values, colnames
|
|
1022
|
+
|
|
1023
|
+
|
|
1024
|
+
def _pglast_parse_ctes(
|
|
1025
|
+
json_data: dict, *, parent_namespace: QueryNamespace, query_txt: str
|
|
1026
|
+
) -> CommonTableExpression:
|
|
1027
|
+
"""Handler method to parse the *WITH* clause of a query.
|
|
1028
|
+
|
|
1029
|
+
Parameters
|
|
1030
|
+
----------
|
|
1031
|
+
json_data : dict
|
|
1032
|
+
JSON enconding of the CTEs, as extracted from the pglast data structure.
|
|
1033
|
+
parent_namespace: QueryNamespace
|
|
1034
|
+
The tables and columns that are available in the current query.
|
|
1035
|
+
query_txt: str
|
|
1036
|
+
The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
|
|
1037
|
+
consider, such as hint blocks.
|
|
1038
|
+
|
|
1039
|
+
Returns
|
|
1040
|
+
-------
|
|
1041
|
+
CommonTableExpression
|
|
1042
|
+
The parsed CTEs.
|
|
1043
|
+
"""
|
|
1044
|
+
parsed_ctes: list[CommonTableExpression] = []
|
|
1045
|
+
for pglast_data in json_data["ctes"]:
|
|
1046
|
+
current_cte: dict = pglast_data["CommonTableExpr"]
|
|
1047
|
+
target_name = current_cte["ctename"]
|
|
1048
|
+
target_table = TableReference.create_virtual(target_name)
|
|
1049
|
+
|
|
1050
|
+
match current_cte.get("ctematerialized", "CTEMaterializeDefault"):
|
|
1051
|
+
case "CTEMaterializeDefault":
|
|
1052
|
+
force_materialization = None
|
|
1053
|
+
case "CTEMaterializeAlways":
|
|
1054
|
+
force_materialization = True
|
|
1055
|
+
case "CTEMaterializeNever":
|
|
1056
|
+
force_materialization = False
|
|
1057
|
+
|
|
1058
|
+
query_data = current_cte["ctequery"]["SelectStmt"]
|
|
1059
|
+
child_nsp = parent_namespace.open_nested(alias=target_name, source="cte")
|
|
1060
|
+
if "targetList" not in query_data and query_data["op"] == "SETOP_NONE":
|
|
1061
|
+
# CTE is a VALUES query
|
|
1062
|
+
values, columns = _pglast_parse_values_cte(current_cte, namespace=child_nsp)
|
|
1063
|
+
parsed_cte = ValuesWithQuery(
|
|
1064
|
+
values,
|
|
1065
|
+
target_name=target_table.identifier(),
|
|
1066
|
+
columns=columns,
|
|
1067
|
+
materialized=force_materialization,
|
|
1068
|
+
)
|
|
1069
|
+
else:
|
|
1070
|
+
cte_query = _pglast_parse_query(
|
|
1071
|
+
current_cte["ctequery"]["SelectStmt"],
|
|
1072
|
+
namespace=child_nsp,
|
|
1073
|
+
query_txt=query_txt,
|
|
1074
|
+
)
|
|
1075
|
+
parsed_cte = WithQuery(
|
|
1076
|
+
cte_query, target_table, materialized=force_materialization
|
|
1077
|
+
)
|
|
1078
|
+
|
|
1079
|
+
parsed_ctes.append(parsed_cte)
|
|
1080
|
+
|
|
1081
|
+
recursive = json_data.get("recursive", False)
|
|
1082
|
+
return CommonTableExpression(parsed_ctes, recursive=recursive)
|
|
1083
|
+
|
|
1084
|
+
|
|
1085
|
+
def _pglast_try_select_star(
    target: dict, *, distinct: list[SqlExpression] | bool
) -> Optional[Select]:
    """Attempts to generate a *SELECT(\\*)* representation for a *SELECT* clause.

    If the query is not actually a *SELECT(\\*)* query, this method will return *None*.

    Parameters
    ----------
    target : dict
        JSON encoding of the target entry in the *SELECT* clause. This data is extracted from the pglast data structure
    distinct : list[SqlExpression] | bool
        The parsed *DISTINCT* part of the *SELECT* clause.

    Returns
    -------
    Optional[Select]
        The parsed *SELECT(\\*)* clause, or *None* if this is not a *SELECT(\\*)* query.
    """
    # a star target is always encoded as a ColumnRef node
    if "ColumnRef" not in target:
        return None

    fields = target["ColumnRef"]["fields"]
    # qualified references ("tab.col") carry multiple fields and cannot be a bare "*"
    if len(fields) != 1:
        return None

    if "A_Star" not in fields[0]:
        return None
    return Select.star(distinct=distinct)
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
def _pglast_parse_select(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> Select:
    """Handler method to parse the *SELECT* clause of a query.

    This is the only parsing handler that will always be called when parsing a query, since all queries must at least have a
    *SELECT* clause.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the entire query. This is required to extract the different projections used in the *SELECT* clause,
        as well as potential required duplicate eliminations via *DISTINCT ON*
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
        consider, such as hint blocks.

    Returns
    -------
    Select
        The parsed *SELECT* clause
    """
    raw_distinct = pglast_data.get("distinctClause", None)
    if raw_distinct is None:
        # key absent --> no duplicate elimination requested
        distinct: list[SqlExpression] | bool = False
    elif raw_distinct == [{}]:
        # a list with a single empty dict is pglast's encoding of a plain DISTINCT
        distinct = True
    elif isinstance(raw_distinct, list):
        # DISTINCT ON (expr, ...) with explicit expressions
        distinct = [
            _pglast_parse_expression(expr, namespace=namespace, query_txt=query_txt)
            for expr in raw_distinct
        ]
    else:
        raise ParserError(f"Unknown DISTINCT format: {raw_distinct}")

    # targetList is always present for SELECT statements, so plain indexing is safe
    targetlist: list[dict] = pglast_data["targetList"]

    # a single target might be a plain SELECT * -- handle that case up front
    if len(targetlist) == 1:
        sole_target = targetlist[0]["ResTarget"]["val"]
        star_clause = _pglast_try_select_star(sole_target, distinct=distinct)

        if star_clause:
            namespace.determine_output_shape(star_clause)
            return star_clause
        # not a SELECT * query after all -- fall through to the regular parsing

    projections: list[BaseProjection] = []
    for entry in targetlist:
        res_target = entry["ResTarget"]
        parsed_expression = _pglast_parse_expression(
            res_target["val"], namespace=namespace, query_txt=query_txt
        )
        # "name" carries an optional AS alias; empty string means no alias
        projections.append(BaseProjection(parsed_expression, res_target.get("name", "")))

    clause = Select(projections, distinct=distinct)
    namespace.determine_output_shape(clause)
    return clause
|
|
1178
|
+
|
|
1179
|
+
|
|
1180
|
+
def _pglast_parse_rangevar(rangevar: dict) -> TableReference:
    """Handler method to extract the `TableReference` from a *RangeVar* entry in the *FROM* clause.

    Parameters
    ----------
    rangevar : dict
        JSON encoding of the range variable, as extracted from the pglast data structure.

    Returns
    -------
    TableReference
        The parsed table reference.
    """
    # the alias is optional and nested one level deeper than the table name
    table_alias = rangevar["alias"]["aliasname"] if "alias" in rangevar else None
    return TableReference(
        rangevar["relname"], table_alias, schema=rangevar.get("schemaname", "")
    )
|
|
1197
|
+
|
|
1198
|
+
|
|
1199
|
+
def _pglast_is_values_list(pglast_data: dict) -> bool:
|
|
1200
|
+
"""Checks, whether a pglast subquery representation refers to an actual subquery or a *VALUES* list.
|
|
1201
|
+
|
|
1202
|
+
Parameters
|
|
1203
|
+
----------
|
|
1204
|
+
pglast_data : dict
|
|
1205
|
+
JSON encoding of the subquery data
|
|
1206
|
+
|
|
1207
|
+
Returns
|
|
1208
|
+
-------
|
|
1209
|
+
bool
|
|
1210
|
+
*True* if the subquery encodes a *VALUES* list, *False* otherwise.
|
|
1211
|
+
"""
|
|
1212
|
+
query = pglast_data["subquery"]["SelectStmt"]
|
|
1213
|
+
return "valuesLists" in query
|
|
1214
|
+
|
|
1215
|
+
|
|
1216
|
+
def _pglast_parse_from_entry(
|
|
1217
|
+
pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
|
|
1218
|
+
) -> TableSource:
|
|
1219
|
+
"""Handler method to parse individual entries in the *FROM* clause.
|
|
1220
|
+
|
|
1221
|
+
Parameters
|
|
1222
|
+
----------
|
|
1223
|
+
pglast_data : dict
|
|
1224
|
+
JSON enconding of the current entry in the *FROM* clause. This data is extracted from the pglast data structure.
|
|
1225
|
+
namespace: QueryNamespace
|
|
1226
|
+
The tables and columns that are available in the current query.
|
|
1227
|
+
query_txt: str
|
|
1228
|
+
The raw query string that was passed to the parser. This is used to extract information that the PG parser does not
|
|
1229
|
+
provide, such as hint blocks.
|
|
1230
|
+
|
|
1231
|
+
Returns
|
|
1232
|
+
-------
|
|
1233
|
+
TableSource
|
|
1234
|
+
The parsed table source.
|
|
1235
|
+
"""
|
|
1236
|
+
pglast_data.pop("location", None)
|
|
1237
|
+
entry_type = util.dicts.key(pglast_data)
|
|
1238
|
+
|
|
1239
|
+
match entry_type:
|
|
1240
|
+
case "RangeVar":
|
|
1241
|
+
table = _pglast_parse_rangevar(pglast_data["RangeVar"])
|
|
1242
|
+
|
|
1243
|
+
# If we specified a virtual table in a CTE, we will reference it later in some FROM clause. In this case,
|
|
1244
|
+
# we should not create a new table reference, but rather use the existing one.
|
|
1245
|
+
# But, if we alias the virtual table, we still need a new reference
|
|
1246
|
+
similar_table = namespace.resolve_table(table.full_name)
|
|
1247
|
+
if similar_table and similar_table.virtual and not table.alias:
|
|
1248
|
+
# a simple reference to the CTE
|
|
1249
|
+
namespace.register_table(similar_table)
|
|
1250
|
+
return DirectTableSource(similar_table)
|
|
1251
|
+
if similar_table and similar_table.virtual and table.alias:
|
|
1252
|
+
# an aliased reference to the CTE
|
|
1253
|
+
table = table.make_virtual()
|
|
1254
|
+
# TODO: should we also update the mapping of the full_name here?
|
|
1255
|
+
|
|
1256
|
+
namespace.register_table(table)
|
|
1257
|
+
return DirectTableSource(table)
|
|
1258
|
+
|
|
1259
|
+
case "JoinExpr":
|
|
1260
|
+
join_expr: dict = pglast_data["JoinExpr"]
|
|
1261
|
+
match join_expr["jointype"]:
|
|
1262
|
+
case "JOIN_INNER" if "quals" in join_expr:
|
|
1263
|
+
join_type = JoinType.InnerJoin
|
|
1264
|
+
case "JOIN_INNER" if "quals" not in join_expr:
|
|
1265
|
+
join_type = JoinType.CrossJoin
|
|
1266
|
+
case "JOIN_LEFT":
|
|
1267
|
+
join_type = JoinType.LeftJoin
|
|
1268
|
+
case "JOIN_RIGHT":
|
|
1269
|
+
join_type = JoinType.RightJoin
|
|
1270
|
+
case "JOIN_OUTER":
|
|
1271
|
+
join_type = JoinType.OuterJoin
|
|
1272
|
+
case "JOIN_FULL":
|
|
1273
|
+
join_type = JoinType.OuterJoin
|
|
1274
|
+
case _:
|
|
1275
|
+
raise ParserError("Unknown join type: " + join_expr["jointype"])
|
|
1276
|
+
|
|
1277
|
+
left = _pglast_parse_from_entry(
|
|
1278
|
+
join_expr["larg"], namespace=namespace, query_txt=query_txt
|
|
1279
|
+
)
|
|
1280
|
+
right = _pglast_parse_from_entry(
|
|
1281
|
+
join_expr["rarg"], namespace=namespace, query_txt=query_txt
|
|
1282
|
+
)
|
|
1283
|
+
if join_type == JoinType.CrossJoin:
|
|
1284
|
+
return JoinTableSource(left, right, join_type=JoinType.CrossJoin)
|
|
1285
|
+
|
|
1286
|
+
join_condition = _pglast_parse_predicate(
|
|
1287
|
+
join_expr["quals"], namespace=namespace, query_txt=query_txt
|
|
1288
|
+
)
|
|
1289
|
+
|
|
1290
|
+
# we do not need to store new tables in available_tables here, since this is already handled by the recursion.
|
|
1291
|
+
return JoinTableSource(
|
|
1292
|
+
left, right, join_condition=join_condition, join_type=join_type
|
|
1293
|
+
)
|
|
1294
|
+
|
|
1295
|
+
case "RangeSubselect" if _pglast_is_values_list(pglast_data["RangeSubselect"]):
|
|
1296
|
+
values_list = _pglast_parse_values(
|
|
1297
|
+
pglast_data["RangeSubselect"],
|
|
1298
|
+
parent_namespace=namespace,
|
|
1299
|
+
query_txt=query_txt,
|
|
1300
|
+
)
|
|
1301
|
+
return values_list
|
|
1302
|
+
|
|
1303
|
+
case "RangeSubselect":
|
|
1304
|
+
raw_subquery: dict = pglast_data["RangeSubselect"]
|
|
1305
|
+
is_lateral = raw_subquery.get("lateral", False)
|
|
1306
|
+
|
|
1307
|
+
if "alias" in raw_subquery:
|
|
1308
|
+
alias: str = raw_subquery["alias"]["aliasname"]
|
|
1309
|
+
else:
|
|
1310
|
+
alias = ""
|
|
1311
|
+
|
|
1312
|
+
child_nsp = namespace.open_nested(alias=alias, source="subquery")
|
|
1313
|
+
subquery = _pglast_parse_query(
|
|
1314
|
+
raw_subquery["subquery"]["SelectStmt"],
|
|
1315
|
+
namespace=child_nsp,
|
|
1316
|
+
query_txt=query_txt,
|
|
1317
|
+
)
|
|
1318
|
+
|
|
1319
|
+
subquery_source = SubqueryTableSource(
|
|
1320
|
+
subquery, target_name=alias, lateral=is_lateral
|
|
1321
|
+
)
|
|
1322
|
+
return subquery_source
|
|
1323
|
+
|
|
1324
|
+
case "RangeFunction":
|
|
1325
|
+
raw_function: dict = pglast_data["RangeFunction"]
|
|
1326
|
+
|
|
1327
|
+
if "alias" in raw_function:
|
|
1328
|
+
alias: str = raw_function["alias"]["aliasname"]
|
|
1329
|
+
else:
|
|
1330
|
+
alias = ""
|
|
1331
|
+
|
|
1332
|
+
function_expr: dict = raw_function["functions"][0]["List"]["items"][0]
|
|
1333
|
+
parsed_function = _pglast_parse_expression(
|
|
1334
|
+
function_expr, namespace=namespace, query_txt=query_txt
|
|
1335
|
+
)
|
|
1336
|
+
return FunctionTableSource(parsed_function, alias=alias)
|
|
1337
|
+
|
|
1338
|
+
case _:
|
|
1339
|
+
raise ParserError("Unknow FROM clause entry: " + str(pglast_data))
|
|
1340
|
+
|
|
1341
|
+
|
|
1342
|
+
def _pglast_parse_values(
    pglast_data: dict, *, parent_namespace: QueryNamespace, query_txt: str
) -> ValuesTableSource:
    """Handler method to parse explicit *VALUES* lists in the *FROM* clause.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the actual *VALUES* list. This data is extracted from the pglast data structure and should be akin
        to a subquery.
    parent_namespace : QueryNamespace
        The tables and columns that are available in the current query. This is only used to register the columns of the
        VALUES list
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    ValuesTableSource
        The parsed *VALUES* list.
    """

    # the alias node (if any) carries both the list's name and its column names
    raw_alias: dict = pglast_data.get("alias", {})
    alias = raw_alias.get("aliasname", "")
    # NOTE(review): a nested namespace is opened even when no alias exists -- confirm
    # that an anonymous "values" namespace is intended in that case
    child_nsp = parent_namespace.open_nested(alias=alias, source="values")
    raw_values: list[dict] = pglast_data["subquery"]["SelectStmt"]["valuesLists"]

    # each row is a List node; rows are stored as tuples of parsed expressions
    values: ValuesList = []
    for row in raw_values:
        raw_items = row["List"]["items"]
        parsed_items = [
            _pglast_parse_expression(item, namespace=child_nsp, query_txt=query_txt)
            for item in raw_items
        ]
        values.append(tuple(parsed_items))

    # without an alias there cannot be named columns, so no output shape is registered
    if not alias:
        return ValuesTableSource(values, alias=alias, columns=[])

    # aliased but without explicit column names -- same situation as above
    if "colnames" not in raw_alias:
        return ValuesTableSource(values, alias=alias, columns=[])

    colnames = []
    for raw_colname in raw_alias["colnames"]:
        colnames.append(raw_colname["String"]["sval"])
    table_source = ValuesTableSource(values, alias=alias, columns=colnames)
    # publish the column references of the VALUES list to the nested namespace
    child_nsp.determine_output_shape(table_source.cols)
    return table_source
|
|
1391
|
+
|
|
1392
|
+
|
|
1393
|
+
def _pglast_parse_from(
    from_clause: list[dict], *, namespace: QueryNamespace, query_txt: str
) -> From:
    """Handler method to parse the *FROM* clause of a query.

    Parameters
    ----------
    from_clause : list[dict]
        The JSON representation of the *FROM* clause, as extracted from the pglast data structure.
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    From
        The parsed *FROM* clause. Depending on the entries, this is the most specific subclass that still fits:
        `ImplicitFromClause` for plain table lists, `ExplicitFromClause` for pure JOIN chains, and the generic
        `From` for anything mixed.
    """
    # Classification flags: we track what kinds of sources appear so that we can pick
    # the most specific From subclass at the end.
    contains_plain_table = False
    contains_join = False
    contains_mixed = False  # plain tables and explicit JOINs, subqueries or VALUES

    table_sources: list[TableSource] = []
    for entry in from_clause:
        current_table_source = _pglast_parse_from_entry(
            entry, namespace=namespace, query_txt=query_txt
        )
        table_sources.append(current_table_source)

        match current_table_source:
            case DirectTableSource():
                contains_plain_table = True
                # a plain table after a JOIN (or vice versa) makes the clause "mixed"
                if contains_join:
                    contains_mixed = True
            case JoinTableSource():
                contains_join = True
                if contains_plain_table:
                    contains_mixed = True
            case SubqueryTableSource():
                contains_mixed = True
            case ValuesTableSource():
                contains_mixed = True

    if not contains_join and not contains_mixed:
        return ImplicitFromClause(table_sources)
    if contains_join and not contains_mixed:
        return ExplicitFromClause(table_sources)

    return From(table_sources)
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
def _pglast_parse_predicate(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> AbstractPredicate:
    """Handler method to parse arbitrary predicates in the *WHERE* or *HAVING* clause.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the predicate data. This data is extracted from the pglast data structure.
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    AbstractPredicate
        The parsed predicate.
    """
    # Drop the source-offset attribute so that the node dict has exactly one key
    # (the node type), which util.dicts.key() extracts for the dispatch below.
    pglast_data.pop("location", None)
    expr_key = util.dicts.key(pglast_data)
    match expr_key:
        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_OP":
            expression = pglast_data["A_Expr"]
            operator = _pglast_parse_operator(expression["name"])
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )
            return BinaryPredicate(operator, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_LIKE":
            expression = pglast_data["A_Expr"]
            # The operator string "~~" maps to LIKE; anything else on an AEXPR_LIKE node
            # is treated as NOT LIKE.
            operator = (
                LogicalOperator.Like
                if expression["name"][0]["String"]["sval"] == "~~"
                else LogicalOperator.NotLike
            )
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )
            return BinaryPredicate(operator, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_ILIKE":
            expression = pglast_data["A_Expr"]
            # "~~*" maps to ILIKE; anything else on an AEXPR_ILIKE node is NOT ILIKE.
            operator = (
                LogicalOperator.ILike
                if expression["name"][0]["String"]["sval"] == "~~*"
                else LogicalOperator.NotILike
            )
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )
            return BinaryPredicate(operator, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_BETWEEN":
            expression = pglast_data["A_Expr"]
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            # the interval bounds are delivered as a two-element list on the right side
            raw_interval = expression["rexpr"]["List"]["items"]
            if len(raw_interval) != 2:
                raise ParserError("Invalid BETWEEN interval: " + str(raw_interval))
            lower = _pglast_parse_expression(
                raw_interval[0], namespace=namespace, query_txt=query_txt
            )
            upper = _pglast_parse_expression(
                raw_interval[1], namespace=namespace, query_txt=query_txt
            )
            return BetweenPredicate(left, (lower, upper))

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_IN":
            expression = pglast_data["A_Expr"]
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            raw_values = expression["rexpr"]["List"]["items"]
            values = [
                _pglast_parse_expression(
                    value, namespace=namespace, query_txt=query_txt
                )
                for value in raw_values
            ]
            predicate = InPredicate(left, values)
            # AEXPR_IN covers both IN ("=") and NOT IN ("<>")
            operator = expression["name"][0]["String"]["sval"]
            if operator == "=":
                return predicate
            elif operator == "<>":
                return CompoundPredicate.create_not(predicate)
            else:
                raise ParserError("Invalid IN operator: " + operator)

        case "BoolExpr":
            # AND/OR/NOT conjunctions: recurse on each argument
            expression = pglast_data["BoolExpr"]
            operator = _PglastOperatorMap[expression["boolop"]]
            children = [
                _pglast_parse_predicate(child, namespace=namespace, query_txt=query_txt)
                for child in expression["args"]
            ]
            return CompoundPredicate(operator, children)

        case "NullTest":
            # IS NULL / IS NOT NULL is modeled as a binary predicate against a NULL literal
            expression = pglast_data["NullTest"]
            testexpr = _pglast_parse_expression(
                expression["arg"], namespace=namespace, query_txt=query_txt
            )
            operation = (
                LogicalOperator.Is
                if expression["nulltesttype"] == "IS_NULL"
                else LogicalOperator.IsNot
            )
            return BinaryPredicate(operation, testexpr, StaticValueExpression.null())

        case "FuncCall":
            # e.g. a boolean-valued function used directly as a filter
            expression = _pglast_parse_expression(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return UnaryPredicate(expression)

        case "SubLink":
            expression = pglast_data["SubLink"]
            sublink_type = expression["subLinkType"]

            subquery = _pglast_parse_query(
                expression["subselect"]["SelectStmt"],
                namespace=namespace.open_nested(source="temporary"),
                query_txt=query_txt,
            )
            if sublink_type == "EXISTS_SUBLINK":
                return UnaryPredicate.exists(subquery)

            testexpr = _pglast_parse_expression(
                expression["testexpr"], namespace=namespace, query_txt=query_txt
            )

            # ANY without an explicit operator name corresponds to a plain IN (subquery)
            if sublink_type == "ANY_SUBLINK" and "operName" not in expression:
                return InPredicate.subquery(testexpr, subquery)

            if sublink_type == "ANY_SUBLINK":
                operator = _PglastOperatorMap[expression["operName"]]
                subquery_expression = FunctionExpression.any_func(subquery)
                return BinaryPredicate(operator, testexpr, subquery_expression)
            elif sublink_type == "ALL_SUBLINK":
                operator = _PglastOperatorMap[expression["operName"]]
                subquery_expression = FunctionExpression.all_func(subquery)
                return BinaryPredicate(operator, testexpr, subquery_expression)
            else:
                raise NotImplementedError("Subquery handling is not yet implemented")

        case _:
            # fallback: treat the node as a plain expression wrapped in a unary predicate
            expression = _pglast_parse_expression(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return UnaryPredicate(expression)
|
|
1610
|
+
|
|
1611
|
+
|
|
1612
|
+
def _pglast_parse_where(
    where_clause: dict, *, namespace: QueryNamespace, query_txt: str
) -> Where:
    """Handler method to parse the *WHERE* clause of a query.

    The heavy lifting is delegated to `_pglast_parse_predicate`; this function merely
    wraps the resulting predicate in a `Where` clause object.

    Parameters
    ----------
    where_clause : dict
        The JSON representation of the *WHERE* clause, as extracted from the pglast data structure.
    namespace: QueryNamespace
        The tables and columns that can be referenced by expressions in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    Where
        The parsed *WHERE* clause.
    """
    return Where(
        _pglast_parse_predicate(where_clause, namespace=namespace, query_txt=query_txt)
    )
|
|
1636
|
+
|
|
1637
|
+
|
|
1638
|
+
def _pglast_parse_groupby(
    groupby_clause: list[dict], *, namespace: QueryNamespace, query_txt: str
) -> GroupBy:
    """Handler method to parse the *GROUP BY* clause of a query.

    Parameters
    ----------
    groupby_clause : list[dict]
        The JSON representation of the *GROUP BY* clause, as extracted from the pglast data structure
    namespace: QueryNamespace
        The tables and columns that can be referenced by expressions in the query
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    GroupBy
        The parsed *GROUP BY* clause.

    Raises
    ------
    NotImplementedError
        If the clause contains a grouping set, which the parser does not support yet.
    """
    grouped_expressions: list[SqlExpression] = []
    for raw_group_item in groupby_clause:
        # GROUPING SETS / ROLLUP / CUBE nodes are not supported yet
        if "GroupingSet" in raw_group_item:
            raise NotImplementedError("Grouping sets are not yet supported")
        grouped_expressions.append(
            _pglast_parse_expression(
                raw_group_item, namespace=namespace, query_txt=query_txt
            )
        )
    return GroupBy(grouped_expressions)
|
|
1669
|
+
|
|
1670
|
+
|
|
1671
|
+
def _pglast_parse_having(
    having_clause: dict, *, namespace: QueryNamespace, query_txt: str
) -> Having:
    """Handler method to parse the *HAVING* clause of a query.

    The heavy lifting is delegated to `_pglast_parse_predicate`; this function merely
    wraps the resulting predicate in a `Having` clause object.

    Parameters
    ----------
    having_clause : dict
        The JSON representation of the *HAVING* clause, as extracted from the pglast data structure.
    namespace: QueryNamespace
        The tables and columns that can be referenced by expressions in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    Having
        The parsed *HAVING* clause.
    """
    return Having(
        _pglast_parse_predicate(having_clause, namespace=namespace, query_txt=query_txt)
    )
|
|
1695
|
+
|
|
1696
|
+
|
|
1697
|
+
def _pglast_parse_orderby(
    order_clause: list[dict], *, namespace: QueryNamespace, query_txt: str
) -> OrderBy:
    """Handler method to parse the *ORDER BY* clause of a query.

    Parameters
    ----------
    order_clause : list[dict]
        The JSON representation of the *ORDER BY* clause, as extracted from the pglast data structure.
    namespace : QueryNamespace
        The tables and columns that can be referenced by expressions in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    OrderBy
        The parsed *ORDER BY* clause.

    Raises
    ------
    ParserError
        If the sort direction or the NULLS placement uses an unknown encoding.
    """
    # True = ascending / NULLS FIRST, False = descending / NULLS LAST, None = unspecified
    direction_lookup = {
        "SORTBY_ASC": True,
        "SORTBY_DESC": False,
        "SORTBY_DEFAULT": None,
    }
    nulls_lookup = {
        "SORTBY_NULLS_FIRST": True,
        "SORTBY_NULLS_LAST": False,
        "SORTBY_NULLS_DEFAULT": None,
    }

    orderings: list[OrderByExpression] = []
    for item in order_clause:
        sort_spec = item["SortBy"]
        sort_key = _pglast_parse_expression(
            sort_spec["node"], namespace=namespace, query_txt=query_txt
        )

        raw_direction = sort_spec["sortby_dir"]
        if raw_direction not in direction_lookup:
            raise ParserError("Unknown sort direction: " + raw_direction)
        raw_nulls = sort_spec["sortby_nulls"]
        if raw_nulls not in nulls_lookup:
            raise ParserError("Unknown nulls placement: " + raw_nulls)

        orderings.append(
            OrderByExpression(
                sort_key,
                ascending=direction_lookup[raw_direction],
                nulls_first=nulls_lookup[raw_nulls],
            )
        )

    return OrderBy(orderings)
|
|
1753
|
+
|
|
1754
|
+
|
|
1755
|
+
def _pglast_parse_limit(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> Optional[Limit]:
    """Handler method to parse LIMIT and OFFSET clauses.

    This method assumes that the given query actually contains *LIMIT* or *OFFSET* clauses and will fail otherwise.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
        of the ``SelectStmt`` is necessary.
    namespace : QueryNamespace
        The tables and columns that can be referenced by expressions in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    Limit
        The limit clause. Can be *None* if no meaningful limit nor a meaningful offset is specified.
    """
    raw_limit: Optional[dict] = pglast_data.get("limitCount", None)
    raw_offset: Optional[dict] = pglast_data.get("limitOffset", None)
    if raw_limit is None and raw_offset is None:
        return None

    if raw_limit is not None:
        # for LIMIT ALL there is no second ival, but instead an "isnull" member that is set to true
        raw_limit = raw_limit["A_Const"]["ival"]
        nrows: int | None = raw_limit["ival"] if "ival" in raw_limit else None
    else:
        nrows = None
    if raw_offset is not None:
        offset: int | None = raw_offset["A_Const"]["ival"].get("ival", 0)
    else:
        offset = None

    # The AST does not distinguish LIMIT from the SQL-standard FETCH FIRST/NEXT/... forms,
    # so the fetch direction is recovered by scanning the raw query text.
    # NOTE(review): this is a substring search, so a query that merely *contains* the word
    # "limit" (e.g. in a string literal) takes the standard-limit path — presumably acceptable here.
    normalized_query = query_txt.lower()
    contains_standard_limit = (
        "limit" in normalized_query or "fetch first" in normalized_query
    )
    if raw_limit is not None and not contains_standard_limit:
        if "fetch next" in normalized_query:
            fetch_direction = "next"
        elif "fetch prior" in normalized_query:
            fetch_direction = "prior"
        elif "fetch last" in normalized_query:
            fetch_direction = "last"
        else:
            raise ParserError("Could not determine LIMIT option")
    else:
        fetch_direction = "first"

    return Limit(limit=nrows, offset=offset, fetch_direction=fetch_direction)
|
|
1811
|
+
|
|
1812
|
+
|
|
1813
|
+
def _pglast_parse_setop(
    pglast_data: dict, *, parent_namespace: QueryNamespace, query_txt: str
) -> SetQuery:
    """Handler method to parse set operations.

    This method assumes that the given query is indeed a set operation and will fail otherwise.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
        of the ``SelectStmt`` is necessary.
    parent_namespace : QueryNamespace
        The tables and columns that can be referenced by expressions in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser ignores, such
        as hint blocks.

    Returns
    -------
    SetQuery
        The parsed set query.

    Raises
    ------
    ParserError
        If the set operation kind is not *UNION*, *INTERSECT* or *EXCEPT*.
    """
    # an optional WITH clause is shared by both input queries
    if "withClause" in pglast_data:
        with_clause = _pglast_parse_ctes(
            pglast_data["withClause"],
            parent_namespace=parent_namespace,
            query_txt=query_txt,
        )
    else:
        with_clause = None

    # both input queries live in their own nested namespaces
    left_query = _pglast_parse_query(
        pglast_data["larg"],
        namespace=parent_namespace.open_nested(source="setop"),
        query_txt=query_txt,
    )
    right_query = _pglast_parse_query(
        pglast_data["rarg"],
        namespace=parent_namespace.open_nested(source="setop"),
        query_txt=query_txt,
    )

    match pglast_data["op"]:
        case "SETOP_UNION":
            # the "all" flag distinguishes UNION ALL from plain UNION
            operator = (
                SetOperator.UnionAll
                if pglast_data.get("all", False)
                else SetOperator.Union
            )
        case "SETOP_INTERSECT":
            operator = SetOperator.Intersect
        case "SETOP_EXCEPT":
            operator = SetOperator.Except
        case _:
            raise ParserError("Unknown set operation: " + pglast_data["op"])

    # ORDER BY / LIMIT apply to the combined result, hence the parent namespace is used
    if "sortClause" in pglast_data:
        order_clause = _pglast_parse_orderby(
            pglast_data["sortClause"], namespace=parent_namespace, query_txt=query_txt
        )
    else:
        order_clause = None

    if pglast_data["limitOption"] == "LIMIT_OPTION_COUNT":
        limit_clause = _pglast_parse_limit(
            pglast_data, namespace=parent_namespace, query_txt=query_txt
        )
    else:
        limit_clause = None

    parent_namespace.determine_output_shape(None)
    return SetQuery(
        left_query,
        right_query,
        set_operation=operator,
        cte_clause=with_clause,
        orderby_clause=order_clause,
        limit_clause=limit_clause,
    )
|
|
1893
|
+
|
|
1894
|
+
|
|
1895
|
+
def _pglast_parse_explain(pglast_data: dict) -> tuple[Optional[Explain], dict]:
|
|
1896
|
+
"""Handler method to extract the *EXPLAIN* clause from a query.
|
|
1897
|
+
|
|
1898
|
+
Parameters
|
|
1899
|
+
----------
|
|
1900
|
+
pglast_data : dict
|
|
1901
|
+
JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
|
|
1902
|
+
of the dictionary is necessary.
|
|
1903
|
+
|
|
1904
|
+
Returns
|
|
1905
|
+
-------
|
|
1906
|
+
tuple[Optional[Explain], dict]
|
|
1907
|
+
The parsed explain clause if one exists as well as the wrapped query. The query representation should be used for all
|
|
1908
|
+
further parsing steps.
|
|
1909
|
+
"""
|
|
1910
|
+
if "ExplainStmt" not in pglast_data:
|
|
1911
|
+
return None, pglast_data
|
|
1912
|
+
|
|
1913
|
+
pglast_data = pglast_data["ExplainStmt"]
|
|
1914
|
+
explain_options: list[dict] = pglast_data.get("options", [])
|
|
1915
|
+
|
|
1916
|
+
use_analyze = False
|
|
1917
|
+
output_format = "TEXT"
|
|
1918
|
+
for option in explain_options:
|
|
1919
|
+
definition: dict = option["DefElem"]
|
|
1920
|
+
match definition["defname"]:
|
|
1921
|
+
case "analyze":
|
|
1922
|
+
use_analyze = True
|
|
1923
|
+
case "format":
|
|
1924
|
+
output_format = definition["arg"]["String"]["sval"]
|
|
1925
|
+
case _:
|
|
1926
|
+
raise ParserError("Unknown explain option: " + str(definition))
|
|
1927
|
+
|
|
1928
|
+
explain_clause = Explain(use_analyze, output_format)
|
|
1929
|
+
return explain_clause, pglast_data["query"]
|
|
1930
|
+
|
|
1931
|
+
|
|
1932
|
+
def _pglast_parse_query(
    stmt: dict, *, namespace: QueryNamespace, query_txt: str
) -> SelectStatement:
    """Main entry point into the parsing logic.

    This function takes a single SQL SELECT query and provides the corresponding `SqlQuery` object.
    While parsing the different expressions, columns are automatically bound to their tables if they use qualified names.
    Otherwise, they are inferred from the database schema if one is given. If no schema is provided, the column will be
    left unbound.

    Parameters
    ----------
    stmt : dict
        The JSON representation of the query. This should be the contents of the ``SelectStmt`` key in the JSON dictionary.
    namespace : QueryNamespace
        The tables and columns that can be referenced by expressions in the query. This is used to register the columns
        that are used in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser ignores, such
        as hint blocks.

    Returns
    -------
    SelectStatement
        The parsed query
    """
    # Set operations (UNION/INTERSECT/EXCEPT) use an entirely different layout and are
    # handled by a dedicated parser.
    if stmt["op"] != "SETOP_NONE":
        return _pglast_parse_setop(
            stmt, parent_namespace=namespace, query_txt=query_txt
        )

    clauses: list[BaseClause] = []

    # The FROM clause (and a potential WITH clause) is parsed before SELECT so that the
    # namespace already knows all available tables when the projection is resolved.
    if "withClause" in stmt:
        with_clause = _pglast_parse_ctes(
            stmt["withClause"], parent_namespace=namespace, query_txt=query_txt
        )
        clauses.append(with_clause)

    if "fromClause" in stmt:
        from_clause = _pglast_parse_from(
            stmt["fromClause"], namespace=namespace, query_txt=query_txt
        )
        clauses.append(from_clause)

    # Each query is guaranteed to have a SELECT clause, so we can just parse it straight away
    select_clause = _pglast_parse_select(stmt, namespace=namespace, query_txt=query_txt)
    clauses.append(select_clause)

    if "whereClause" in stmt:
        where_clause = _pglast_parse_where(
            stmt["whereClause"], namespace=namespace, query_txt=query_txt
        )
        clauses.append(where_clause)

    if "groupClause" in stmt:
        group_clause = _pglast_parse_groupby(
            stmt["groupClause"], namespace=namespace, query_txt=query_txt
        )
        clauses.append(group_clause)

    if "havingClause" in stmt:
        having_clause = _pglast_parse_having(
            stmt["havingClause"], namespace=namespace, query_txt=query_txt
        )
        clauses.append(having_clause)

    if "sortClause" in stmt:
        order_clause = _pglast_parse_orderby(
            stmt["sortClause"], namespace=namespace, query_txt=query_txt
        )
        clauses.append(order_clause)

    if stmt["limitOption"] == "LIMIT_OPTION_COUNT":
        limit_clause = _pglast_parse_limit(
            stmt, namespace=namespace, query_txt=query_txt
        )
        clauses.append(limit_clause)

    return build_query(clauses)
|
|
2012
|
+
|
|
2013
|
+
|
|
2014
|
+
def _pglast_parse_set_commands(pglast_data: list[dict]) -> tuple[list[str], list[dict]]:
|
|
2015
|
+
"""Handler method to parse all *SET* commands that precede the actual query.
|
|
2016
|
+
|
|
2017
|
+
Parameters
|
|
2018
|
+
----------
|
|
2019
|
+
pglast_data : list[dict]
|
|
2020
|
+
JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
|
|
2021
|
+
of the dictionary is necessary.
|
|
2022
|
+
|
|
2023
|
+
Returns
|
|
2024
|
+
-------
|
|
2025
|
+
tuple[list[str], list[dict]]
|
|
2026
|
+
The parsed *SET* commands as a list of strings and the remaining query data. The query data is "forwarded" to the first
|
|
2027
|
+
encoding that does not represent a *SET* command.
|
|
2028
|
+
"""
|
|
2029
|
+
prep_stmts: list[str] = []
|
|
2030
|
+
|
|
2031
|
+
for i, item in enumerate(pglast_data):
|
|
2032
|
+
stmt: dict = item["stmt"]
|
|
2033
|
+
if "VariableSetStmt" not in stmt:
|
|
2034
|
+
break
|
|
2035
|
+
|
|
2036
|
+
var_set_stmt: dict = stmt["VariableSetStmt"]
|
|
2037
|
+
if var_set_stmt["kind"] != "VAR_SET_VALUE":
|
|
2038
|
+
raise ParserError(f"Unknown variable set option: {var_set_stmt}")
|
|
2039
|
+
var_name = var_set_stmt["name"]
|
|
2040
|
+
var_value = var_set_stmt["args"][0]["A_Const"]["sval"]["sval"]
|
|
2041
|
+
|
|
2042
|
+
parsed_stmt = f"SET {var_name} TO '{var_value}';"
|
|
2043
|
+
prep_stmts.append(parsed_stmt)
|
|
2044
|
+
|
|
2045
|
+
return prep_stmts, pglast_data[i:]
|
|
2046
|
+
|
|
2047
|
+
|
|
2048
|
+
def _parse_hint_block(
|
|
2049
|
+
raw_query: str, *, set_cmds: list[str], _current_hint_text: list[str] = None
|
|
2050
|
+
) -> Optional[Hint]:
|
|
2051
|
+
"""Handler method to extract the hint block (i.e. preceding comments) from a query
|
|
2052
|
+
|
|
2053
|
+
Parameters
|
|
2054
|
+
----------
|
|
2055
|
+
raw_query : str
|
|
2056
|
+
The query text that was passed to the parser. We require access to the raw query, because the PG parser ignores all
|
|
2057
|
+
comments and does not represent them in the AST in any way.
|
|
2058
|
+
set_cmds: list[str]
|
|
2059
|
+
*SET* commands that have already been parsed. These will be added to the hint block.
|
|
2060
|
+
_current_hint_text : list[str], optional
|
|
2061
|
+
Internal parameter to keep track of the current hint text. This is used because the parsing logic uses a recursive
|
|
2062
|
+
implementation.
|
|
2063
|
+
|
|
2064
|
+
Returns
|
|
2065
|
+
-------
|
|
2066
|
+
Optional[Hint]
|
|
2067
|
+
The hint block if any hints were found, or *None* otherwise.
|
|
2068
|
+
"""
|
|
2069
|
+
_current_hint_text = _current_hint_text or []
|
|
2070
|
+
|
|
2071
|
+
raw_query = raw_query.lstrip()
|
|
2072
|
+
block_hint = raw_query.startswith("/*")
|
|
2073
|
+
line_hint = raw_query.startswith("--")
|
|
2074
|
+
if not block_hint and not line_hint:
|
|
2075
|
+
prep_stms = "\n".join(set_cmds)
|
|
2076
|
+
hints = "\n".join(_current_hint_text)
|
|
2077
|
+
return Hint(prep_stms, hints) if prep_stms or hints else None
|
|
2078
|
+
|
|
2079
|
+
if line_hint:
|
|
2080
|
+
line_end = raw_query.find("\n")
|
|
2081
|
+
if line_end == -1:
|
|
2082
|
+
# should never be raised b/c parsing should have failed already at this point
|
|
2083
|
+
raise ParserError(f"Unterminated line comment: {raw_query}")
|
|
2084
|
+
|
|
2085
|
+
line_comment = raw_query[:line_end].strip()
|
|
2086
|
+
_current_hint_text.append(line_comment)
|
|
2087
|
+
return _parse_hint_block(
|
|
2088
|
+
raw_query[line_end:],
|
|
2089
|
+
set_cmds=set_cmds,
|
|
2090
|
+
_current_hint_text=_current_hint_text,
|
|
2091
|
+
)
|
|
2092
|
+
|
|
2093
|
+
# must be block hint
|
|
2094
|
+
block_end = raw_query.find("*/")
|
|
2095
|
+
if block_end == -1:
|
|
2096
|
+
# should never be raised b/c parsing should have failed already at this point
|
|
2097
|
+
raise ParserError(f"Unterminated block comment: {raw_query}")
|
|
2098
|
+
|
|
2099
|
+
block_comment = raw_query[: block_end + 2].strip()
|
|
2100
|
+
_current_hint_text.append(block_comment)
|
|
2101
|
+
return _parse_hint_block(
|
|
2102
|
+
raw_query[block_end + 2 :],
|
|
2103
|
+
set_cmds=set_cmds,
|
|
2104
|
+
_current_hint_text=_current_hint_text,
|
|
2105
|
+
)
|
|
2106
|
+
|
|
2107
|
+
|
|
2108
|
+
def _apply_extra_clauses(
    parsed: SelectStatement, *, hint: Optional[Hint], explain_clause: Optional[Explain]
) -> SelectStatement:
    """Attach an optional hint block and/or *EXPLAIN* clause to an already-parsed statement.

    Parameters
    ----------
    parsed : SelectStatement
        The statement that was produced by the main parsing logic.
    hint : Optional[Hint]
        The hint block to attach, if any was extracted from the raw query text.
    explain_clause : Optional[Explain]
        The *EXPLAIN* clause to attach, if the query was wrapped in an ``EXPLAIN`` statement.

    Returns
    -------
    SelectStatement
        A new statement containing all original clauses plus the supplied extras.
    """
    clauses = list(parsed.clauses())
    if hint:
        clauses.append(hint)
    if explain_clause:
        clauses.append(explain_clause)
    return build_query(clauses)
|
|
2117
|
+
|
|
2118
|
+
|
|
2119
|
+
# Typing overloads for parse_query(): without the accept_set_query argument the result is
# narrowed to a plain SqlQuery; once the caller passes accept_set_query explicitly, the
# broader SelectStatement return type applies. The implementation follows below.
@overload
def parse_query(
    query: str,
    *,
    include_hints: bool = True,
    bind_columns: bool | None = None,
    db_schema: Optional[DBCatalog] = None,
) -> SqlQuery: ...


@overload
def parse_query(
    query: str,
    *,
    accept_set_query: bool,
    include_hints: bool = True,
    bind_columns: Optional[bool] = None,
    db_schema: Optional[DBCatalog] = None,
) -> SelectStatement: ...
|
|
2138
|
+
|
|
2139
|
+
|
|
2140
|
+
def parse_query(
|
|
2141
|
+
query: str,
|
|
2142
|
+
*,
|
|
2143
|
+
accept_set_query: bool = False,
|
|
2144
|
+
include_hints: bool = True,
|
|
2145
|
+
bind_columns: Optional[bool] = None,
|
|
2146
|
+
db_schema: Optional[DBCatalog] = None,
|
|
2147
|
+
) -> SelectStatement:
|
|
2148
|
+
"""Parses a query string into a proper `SqlQuery` object.
|
|
2149
|
+
|
|
2150
|
+
During parsing, the appropriate type of SQL query (i.e. with implicit, explicit or mixed *FROM* clause) will be
|
|
2151
|
+
inferred automatically. Therefore, this method can potentially return a subclass of `SqlQuery`.
|
|
2152
|
+
|
|
2153
|
+
Once the query has been transformed, a text-based binding process is executed. During this process, the referenced
|
|
2154
|
+
tables are normalized such that column references using the table alias are linked to the correct tables that are
|
|
2155
|
+
specified in the *FROM* clause (see the module-level documentation for an example). The parsing process can
|
|
2156
|
+
optionally also involve a binding process based on the schema of a live database. This is important for all
|
|
2157
|
+
remaining columns where the text-based parsing was not possible, e.g. because the column was specified without a
|
|
2158
|
+
table alias.
|
|
2159
|
+
|
|
2160
|
+
Parameters
|
|
2161
|
+
----------
|
|
2162
|
+
query : str
|
|
2163
|
+
The query to parse
|
|
2164
|
+
accept_set_query : bool, optional
|
|
2165
|
+
Whether set queries are a valid result of the parsing process. If this is *False* (the default), an error will be
|
|
2166
|
+
raised if the input query is a set query. This implies that the result of the parsing process is always a `SqlQuery`
|
|
2167
|
+
instance. Otherwise, the result can also be a `SetQuery` instance.
|
|
2168
|
+
include_hints : bool, optional
|
|
2169
|
+
Whether to include hints in the parsed query. If this is *True* (the default), any preceding comments in the query
|
|
2170
|
+
text will be parsed as a hint block. Otherwise, these comments are simply ignored.
|
|
2171
|
+
bind_columns : bool | None, optional
|
|
2172
|
+
Whether to use *live binding*. This does not control the text-based binding, which is always performed. If this
|
|
2173
|
+
parameter is *None* (the default), the global `auto_bind_columns` variable will be queried. Depending on its
|
|
2174
|
+
value, live binding will be performed or not.
|
|
2175
|
+
db_schema : Optional[DBCatalog], optional
|
|
2176
|
+
For live binding, this indicates the database to use. If this is *None* (the default), the database will be
|
|
2177
|
+
tried to extract from the `DatabasePool`
|
|
2178
|
+
|
|
2179
|
+
Returns
|
|
2180
|
+
-------
|
|
2181
|
+
SqlQuery
|
|
2182
|
+
The parsed SQL query.
|
|
2183
|
+
"""
|
|
2184
|
+
# NOTE: this documentation is a 1:1 copy of qal.parse_query. Both should be kept in sync.
|
|
2185
|
+
if not query:
|
|
2186
|
+
raise ParserError("Empty query")
|
|
2187
|
+
|
|
2188
|
+
if db_schema is None and (
|
|
2189
|
+
bind_columns or (bind_columns is None and auto_bind_columns)
|
|
2190
|
+
):
|
|
2191
|
+
from ..db import DatabasePool # local import to prevent circular imports
|
|
2192
|
+
|
|
2193
|
+
db_schema = (
|
|
2194
|
+
None
|
|
2195
|
+
if DatabasePool.get_instance().empty()
|
|
2196
|
+
else DatabasePool.get_instance().current_database().schema()
|
|
2197
|
+
)
|
|
2198
|
+
|
|
2199
|
+
pglast_data = json.loads(pglast.parser.parse_sql_json(query))
|
|
2200
|
+
stmts = pglast_data["stmts"]
|
|
2201
|
+
|
|
2202
|
+
set_cmds, stmts = _pglast_parse_set_commands(stmts)
|
|
2203
|
+
if len(stmts) != 1:
|
|
2204
|
+
raise ValueError("Parser can only support single-statement queries for now")
|
|
2205
|
+
raw_query: dict = stmts[0]["stmt"]
|
|
2206
|
+
|
|
2207
|
+
if "ExplainStmt" in raw_query:
|
|
2208
|
+
explain_clause, raw_query = _pglast_parse_explain(raw_query)
|
|
2209
|
+
else:
|
|
2210
|
+
explain_clause = None
|
|
2211
|
+
|
|
2212
|
+
if "SelectStmt" not in raw_query:
|
|
2213
|
+
raise ValueError("Cannot parse non-SELECT queries")
|
|
2214
|
+
stmt = raw_query["SelectStmt"]
|
|
2215
|
+
|
|
2216
|
+
parsed_query = _pglast_parse_query(
|
|
2217
|
+
stmt, namespace=QueryNamespace.empty(db_schema), query_txt=query
|
|
2218
|
+
)
|
|
2219
|
+
if not accept_set_query and isinstance(query, SetQuery):
|
|
2220
|
+
raise ParserError("Input query is a set query")
|
|
2221
|
+
|
|
2222
|
+
hint = _parse_hint_block(query, set_cmds=set_cmds) if include_hints else None
|
|
2223
|
+
parsed_query = _apply_extra_clauses(
|
|
2224
|
+
parsed_query, hint=hint, explain_clause=explain_clause
|
|
2225
|
+
)
|
|
2226
|
+
|
|
2227
|
+
return parsed_query
|
|
2228
|
+
|
|
2229
|
+
|
|
2230
|
+
class ParserError(RuntimeError):
    """Signals that an SQL query (or query fragment) could not be parsed."""

    def __init__(self, msg: str) -> None:
        # RuntimeError already stores the message; this merely pins down the signature.
        super().__init__(msg)
|
|
2235
|
+
|
|
2236
|
+
|
|
2237
|
+
def load_table_json(json_data: dict | str) -> Optional[TableReference]:
    """Re-creates a table reference from its JSON encoding.

    Parameters
    ----------
    json_data : dict | str
        Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)

    Returns
    -------
    Optional[TableReference]
        The actual table. If the dictionary is empty or otherwise invalid, *None* is returned.
    """
    if not json_data:
        return None
    payload = json.loads(json_data) if not isinstance(json_data, dict) else json_data
    # Missing keys fall back to the same defaults the TableReference constructor would use
    full_name = payload.get("full_name", "")
    alias = payload.get("alias", "")
    return TableReference(
        full_name,
        alias,
        virtual=payload.get("virtual", False),
        schema=payload.get("schemaname", None),
    )
|
|
2259
|
+
|
|
2260
|
+
|
|
2261
|
+
def load_column_json(json_data: dict | str) -> Optional[ColumnReference]:
    """Re-creates a column reference from its JSON encoding.

    Parameters
    ----------
    json_data : dict | str
        Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)

    Returns
    -------
    Optional[ColumnReference]
        The actual column. It the dictionary is empty or otherwise invalid, *None* is returned.
    """
    if not json_data:
        return None
    payload = json.loads(json_data) if not isinstance(json_data, dict) else json_data
    # The owning table is itself JSON-encoded and may be absent entirely
    owning_table = load_table_json(payload.get("table", None))
    return ColumnReference(payload.get("column"), owning_table)
|
|
2280
|
+
|
|
2281
|
+
|
|
2282
|
+
def load_expression_json(json_data: dict | str) -> Optional[SqlExpression]:
    """Re-creates an arbitrary SQL expression from its JSON encoding.

    Parameters
    ----------
    json_data : dict | str
        Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)

    Returns
    -------
    Optional[SqlExpression]
        The actual expression. If the dictionary is empty or *None*, *None* is returned. Notice that in case of
        malformed data, errors are raised.
    """
    if not json_data:
        return None
    payload = json.loads(json_data) if not isinstance(json_data, dict) else json_data

    referenced_tables = [
        load_table_json(table_data) for table_data in payload.get("tables", [])
    ]
    expression_str = payload["expression"]

    # Wrap the expression in a minimal SELECT so the regular parser can handle it,
    # adding a FROM clause only when the expression references tables.
    if referenced_tables:
        from_clause_str = ", ".join(str(tab) for tab in referenced_tables)
        emulated_query = f"SELECT {expression_str} FROM {from_clause_str}"
    else:
        emulated_query = f"SELECT {expression_str}"

    reparsed = parse_query(emulated_query)
    return reparsed.select_clause.targets[0].expression
|
|
2310
|
+
|
|
2311
|
+
|
|
2312
|
+
def load_predicate_json(json_data: dict | str) -> Optional[AbstractPredicate]:
    """Re-creates an arbitrary predicate from its JSON encoding.

    Parameters
    ----------
    json_data : dict | str
        Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)

    Returns
    -------
    Optional[AbstractPredicate]
        The actual predicate. If the dictionary is empty or *None*, *None* is returned. Notice that in case of
        malformed data, errors are raised.

    Raises
    ------
    KeyError
        If the encoding does not specify the tables that are referenced in the predicate
    KeyError
        If the encoding does not contain the actual predicate
    """
    if not json_data:
        return None
    payload = json.loads(json_data) if not isinstance(json_data, dict) else json_data

    referenced_tables = [
        load_table_json(table_data) for table_data in payload.get("tables", [])
    ]
    if not referenced_tables:
        raise KeyError("Predicate needs at least one table!")

    # Embed the predicate in a minimal SELECT over its tables and let the
    # regular parser reconstruct the predicate tree.
    from_clause_str = ", ".join(str(tab) for tab in referenced_tables)
    predicate_str = payload["predicate"]
    emulated_query = f"SELECT * FROM {from_clause_str} WHERE {predicate_str}"
    reparsed = parse_query(emulated_query)
    return reparsed.where_clause.predicate
|